In [1]:
import tensorflow as tf
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
#shuffling the data prevents from learning spurious patterns in the training data. 
#This also improves the convergence of gradient based methods, such as training a neural network using Gradient Descent. To shuffle the data, use the shuffle() method

In [5]:
file_path = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
dataset = tf.data.TextLineDataset(file_path)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv


In [10]:
data = tf.data.TextLineDataset(file_path)
counter = tf.data.experimental.Counter()

dataset = tf.data.Dataset.zip((counter, data))
dataset = dataset.batch(20)


In [11]:
n,line_batch = next(iter(dataset))
print(n.numpy())

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [12]:
data = tf.data.TextLineDataset(file_path)
counter = tf.data.experimental.Counter()

dataset = tf.data.Dataset.zip((counter, data))
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.batch(20)
dataset

<BatchDataset shapes: ((None,), (None,)), types: (tf.int64, tf.string)>

In [13]:
n,line_batch = next(iter(dataset))
print(n.numpy())

[ 69   7   5  10  52  84  63  70  92  25 107   0  35 110 103   2  86  48
  80  68]


In [None]:
#time series windowing

In [25]:
ds = tf.data.Dataset.range(50)

In [26]:
batches = ds.batch(10, drop_remainder=True)

for batch in batches.take(5):
  print(batch.numpy())

[0 1 2 3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17 18 19]
[20 21 22 23 24 25 26 27 28 29]
[30 31 32 33 34 35 36 37 38 39]
[40 41 42 43 44 45 46 47 48 49]


In [79]:
def dense_1_step(batch):
  return batch[:8], batch[3:]

predict_dense_1_step = batches.map(dense_1_step)

for features, label in predict_dense_1_step.take(3):
  print(features.numpy(), " => ", label.numpy())

[0 1 2 3 4 5 6 7]  =>  [3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17]  =>  [13 14 15 16 17 18 19]
[20 21 22 23 24 25 26 27]  =>  [23 24 25 26 27 28 29]


In [45]:
feature_length = 10
label_length = 5

features = ds.batch(feature_length, drop_remainder=True)
labels = ds.batch(feature_length).skip(1).map(lambda labels: labels[:3])

predict = tf.data.Dataset.zip((features, labels))

for features, label in predict.take(5):
  print(features.numpy(), " => ", label.numpy())

[0 1 2 3 4 5 6 7 8 9]  =>  [10 11 12]
[10 11 12 13 14 15 16 17 18 19]  =>  [20 21 22]
[20 21 22 23 24 25 26 27 28 29]  =>  [30 31 32]
[30 31 32 33 34 35 36 37 38 39]  =>  [40 41 42]


In [89]:
window_size = 10

windows = ds.window(window_size, shift=1)
for sub_ds in windows.take(5):
  print(sub_ds)

<_VariantDataset shapes: (4,), types: tf.int64>
<_VariantDataset shapes: (4,), types: tf.int64>
<_VariantDataset shapes: (4,), types: tf.int64>
<_VariantDataset shapes: (4,), types: tf.int64>
<_VariantDataset shapes: (4,), types: tf.int64>


In [90]:
for x in windows.flat_map(lambda x: x).take(20):
   print(x.numpy(), end=' ')

Cause: could not parse the source code:

for x in windows.flat_map(lambda x: x).take(20):

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not parse the source code:

for x in windows.flat_map(lambda x: x).take(20):

This error may be avoided by creating the lambda in a standalone statement.

[ 0  5 10 15] [20 25 30 35] [ 5 10 15 20] [25 30 35 40] [10 15 20 25] [30 35 40 45] [15 20 25 30] [20 25 30 35] [25 30 35 40] [30 35 40 45] [20 25 30 35] [ 5 10 15 20] [25 30 35 40] [10 15 20 25] [30 35 40 45] [15 20 25 30] [20 25 30 35] [25 30 35 40] [30 35 40 45] [ 5 10 15 20] 

In [91]:
def sub_to_batch(sub):
  return sub.batch(window_size, drop_remainder=True)

for example in windows.flat_map(sub_to_batch).take(5):
  print(example.numpy())

[[ 0  5 10 15]
 [20 25 30 35]
 [ 5 10 15 20]
 [25 30 35 40]
 [10 15 20 25]
 [30 35 40 45]
 [15 20 25 30]
 [20 25 30 35]
 [25 30 35 40]
 [30 35 40 45]]


In [96]:
ran_ds=tf.data.Dataset.range(500)

In [97]:
def make_window_dataset(ds, window_size=5, shift=1, stride=1):
  windows = ds.window(window_size, shift=shift, stride=stride)
  windows = windows.flat_map(sub_to_batch)
  return windows

In [103]:
ds = make_window_dataset(ran_ds, window_size=10, shift = 2, stride=3)

for example in ds.take(10):
  print(example.numpy())

[ 0  3  6  9 12 15 18 21 24 27]
[ 2  5  8 11 14 17 20 23 26 29]
[ 4  7 10 13 16 19 22 25 28 31]
[ 6  9 12 15 18 21 24 27 30 33]
[ 8 11 14 17 20 23 26 29 32 35]
[10 13 16 19 22 25 28 31 34 37]
[12 15 18 21 24 27 30 33 36 39]
[14 17 20 23 26 29 32 35 38 41]
[16 19 22 25 28 31 34 37 40 43]
[18 21 24 27 30 33 36 39 42 45]


In [104]:
dense_labels_ds = ds.map(dense_1_step)

for inputs,labels in dense_labels_ds.take(3):
  print(inputs.numpy(), "=>", labels.numpy())

[ 0  3  6  9 12 15 18 21] => [ 9 12 15 18 21 24 27]
[ 2  5  8 11 14 17 20 23] => [11 14 17 20 23 26 29]
[ 4  7 10 13 16 19 22 25] => [13 16 19 22 25 28 31]
