In [None]:
## Ch. 13 




In [1]:
# Setup
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tfx==0.21.2
    print("You can safely ignore the package incompatibility errors.")
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "data"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory up front; exist_ok=True keeps re-runs of this cell from failing.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into the IMAGES_PATH directory.

    Args:
        fig_id: File name without extension; also echoed in the progress message.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    filename = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [3]:
### 13.1 The Data API

# Create a dataset whose items are the slices of X along its first dimension.

X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)

# Same values built directly from a range; this rebinding is the dataset that is kept.
# NOTE(review): tf.range(10) defaults to int32 while Dataset.range yields int64 items
# (see the printed output of the next cell), so the two are not dtype-identical — confirm if it matters.
dataset = tf.data.Dataset.range(10)

## 13.1.1 Chaining Transformations

In [4]:

# tf.data methods never modify a dataset in place; each call returns a new dataset.
# Start again from the source range so this cell gives the same result every time it is re-run.
dataset = tf.data.Dataset.range(10).repeat(3).batch(7)

# map() transforms each item of the dataset (here: each batch of up to 7 values).
dataset = dataset.map(lambda x: x * 2)
# unbatch() splits the batches back into individual items. It replaces the deprecated
# dataset.apply(tf.data.experimental.unbatch()); apply() remains the general way to run
# a function on the dataset as a whole.
dataset = dataset.unbatch()
# filter() keeps only the items for which the predicate is true.
dataset = dataset.filter(lambda x: x < 10)

# Peek at the first few items of the dataset.
for item in dataset.take(3):
    print(item)


Instructions for updating:
Use `tf.data.Dataset.unbatch()`.
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)


## 13.1.2 Shuffling the Data

In [5]:
# Gradient descent works best when the training instances are independent and
# identically distributed, so the items are shuffled before they are batched.

dataset = tf.data.Dataset.range(10).repeat(3)
# shuffle() draws items at random from a buffer of buffer_size elements, so a small
# buffer only shuffles locally; the fixed seed is there to make the order repeatable.
dataset = dataset.shuffle(buffer_size = 5, seed = 5).batch(7)

for item in dataset:
    print(item)

tf.Tensor([0 2 1 7 8 6 4], shape=(7,), dtype=int64)
tf.Tensor([0 1 3 2 9 4 3], shape=(7,), dtype=int64)
tf.Tensor([5 7 5 6 0 9 1], shape=(7,), dtype=int64)
tf.Tensor([5 3 4 6 2 7 9], shape=(7,), dtype=int64)
tf.Tensor([8 8], shape=(2,), dtype=int64)


In [6]:
# Attention
# If you call repeat() on a shuffled dataset, it is reshuffled at every iteration by default.
# If that is not wanted, pass reshuffle_each_iteration=False to shuffle().


# Other shuffling approaches:

# - shuffle the whole source data up front
# - split the source into several files and read them in random order
#

### Interleaving lines from multiple files

California housing data




In [31]:
# prepare data

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# get the data
housing = fetch_california_housing()

# split in train/test, then train into train/valid
# reshape(-1, 1) turns the target into a single-column 2-D array.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# scale data
# The scaler is fitted on the training split only. The data itself is not transformed
# here: the per-feature mean and scale are kept so preprocess() can standardize each
# parsed CSV line later.
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

# Split the data into several CSV files, as if it did not fit in memory.
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Write the rows of ``data`` to ``n_parts`` CSV files and return their paths.

    The files are created in ``datasets/housing`` (relative to the current working
    directory) and are named ``my_<name_prefix>_<NN>.csv``. Rows are distributed with
    ``np.array_split``, so part sizes differ by at most one row.

    Args:
        data: Indexable sequence of rows (e.g. a 2-D NumPy array); each row is an
            iterable of values.
        name_prefix: Text inserted into every file name.
        header: Optional header line written at the top of every file.
        n_parts: Number of files to create.

    Returns:
        The list of file paths, in part order.
    """
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    def format_value(value):
        # NumPy >= 2 renders scalars as e.g. "np.float64(1.5)" under repr(), which is
        # not a parseable CSV field. str() of a NumPy number gives the bare shortest
        # round-trip text ("1.5") on old and new NumPy alike.
        if isinstance(value, np.number):
            return str(value)
        # Other NumPy scalars (e.g. np.bool_) are converted to the Python equivalent first.
        if isinstance(value, np.generic):
            return repr(value.item())
        return repr(value)

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(format_value(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths


# Append the target as the last column so every CSV row holds the features followed by the target.
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
# Header line: the feature names plus a name for the target column.
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

# Write each split to several CSV parts and keep the resulting file paths for the input pipeline.
train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)


In [32]:
# Let's look at the first CSV part in two ways.

import pandas as pd

# As a DataFrame. display() is required here: a cell only renders its *last*
# expression automatically, and this one is followed by more code, so without
# display() the table would be silently dropped.
display(pd.read_csv(train_filepaths[0]).head())

# In text mode: the header line followed by the first four data rows.
# The files were written as UTF-8, so they are read back with the same encoding.
with open(train_filepaths[0], encoding="utf-8") as f:
    for line_number in range(5):
        print(f.readline(), end="")


MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442
5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687
3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621
7.1736,12.0,6.289002557544757,0.9974424552429667,1054.0,2.6956521739130435,33.55,-117.7,2.621


In [33]:
## Building an Input Pipeline

# 1) combine all filepaths into 1 dataset
# (list_files shuffles the file paths by default; the seed fixes that order)
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed= 42)

# 2) interleave
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)

# interleave() opens n_readers (5) files at a time, skips the header row of each, and
# reads one line from each file in turn (round-robin, not at random) until those files
# are exhausted; it then moves on to the next files until every CSV part has been read.


In [34]:
# Show the first five raw lines produced by the interleaved dataset (byte strings, not yet parsed).
for line in dataset.take(5):
    print(line.numpy())

b'4.7361,7.0,7.464968152866242,1.1178343949044587,846.0,2.694267515923567,34.49,-117.27,1.745'
b'3.6641,17.0,5.577142857142857,1.1542857142857144,511.0,2.92,40.85,-121.07,0.808'
b'4.5909,16.0,5.475877192982456,1.0964912280701755,1357.0,2.9758771929824563,33.63,-117.71,2.418'
b'3.6875,44.0,4.524475524475524,0.993006993006993,457.0,3.195804195804196,34.04,-118.15,1.625'
b'2.3,25.0,5.828178694158075,0.9587628865979382,909.0,3.1237113402061856,36.25,-119.4,1.328'


In [35]:
# Parse one CSV line and standardize its feature values.
n_inputs = 8 # X_train.shape[-1]

@tf.function
def preprocess(line):
    """Turn one CSV text line into a ``(features, target)`` pair.

    The line is decoded into ``n_inputs`` feature columns plus one target column.
    The features are stacked into a vector and standardized with the module-level
    ``X_mean`` and ``X_std``; the target is returned as a one-element vector.
    """
    # A plain 0. default makes a feature column an optional float; the empty float32
    # constant gives the target column no default value.
    record_defaults = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    columns = tf.io.decode_csv(line, record_defaults=record_defaults)
    # decode_csv returns one scalar tensor per column; stack them into vectors.
    features = tf.stack(columns[:-1])
    target = tf.stack(columns[-1:])
    standardized = (features - X_mean) / X_std
    return standardized, target


In [36]:
# Try the preprocess function on a single raw CSV line (8 feature values followed by the target).

preprocess(b'3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([-0.19397889, -1.0778131 , -0.9433855 ,  0.01485314,  0.02073333,
        -0.5729162 ,  0.9292612 , -1.4221538 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.442], dtype=float32)>)

### Create a pipeline that puts all the preprocessing steps together


In [37]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a batched tf.data pipeline of preprocessed samples from CSV files.

    Args:
        filepaths: File paths (or patterns) passed to ``Dataset.list_files``.
        repeat: Passed to ``repeat()`` on the dataset of file paths.
        n_readers: Number of files interleaved at the same time (``cycle_length``).
        n_read_threads: ``num_parallel_calls`` for the interleave step.
        shuffle_buffer_size: Buffer size used to shuffle the text lines.
        n_parse_threads: ``num_parallel_calls`` for the ``preprocess`` map step.
        batch_size: Number of samples per batch.

    Returns:
        A dataset of ``preprocess`` outputs grouped into batches, with a prefetch of 1.
    """
    def read_data_lines(filepath):
        # One line dataset per file, without its header row.
        return tf.data.TextLineDataset(filepath).skip(1)

    return (
        tf.data.Dataset.list_files(filepaths)
        .repeat(repeat)
        .interleave(read_data_lines,
                    cycle_length=n_readers,
                    num_parallel_calls=n_read_threads)
        .shuffle(shuffle_buffer_size)
        .map(preprocess, num_parallel_calls=n_parse_threads)
        .batch(batch_size)
        # Prepare the next batch while the current one is being consumed.
        .prefetch(1)
    )

In [38]:
# Fix TensorFlow's global random seed before building the (shuffling) pipeline.
tf.random.set_seed(42)

# Small batches of 3 keep the printed sample readable; show the first two batches.
train_set = csv_reader_dataset(train_filepaths, batch_size=3)
for X_batch, y_batch in train_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)
    print()

X = tf.Tensor(
[[ 0.5804519  -0.20762321  0.05616303 -0.15191229  0.01343246  0.00604472
   1.2525111  -1.3671792 ]
 [ 5.818099    1.8491895   1.1784915   0.28173092 -1.2496178  -0.3571987
   0.7231292  -1.0023477 ]
 [-0.9253566   0.5834586  -0.7807257  -0.28213993 -0.36530012  0.27389365
  -0.76194876  0.72684526]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[1.752]
 [1.313]
 [1.535]], shape=(3, 1), dtype=float32)

X = tf.Tensor(
[[-0.8324941   0.6625668  -0.20741376 -0.18699841 -0.14536144  0.09635526
   0.9807942  -0.67250353]
 [-0.62183803  0.5834586  -0.19862501 -0.3500319  -1.1437552  -0.3363751
   1.107282   -0.8674123 ]
 [ 0.8683102   0.02970133  0.3427381  -0.29872298  0.7124906   0.28026953
  -0.72915536  0.86178064]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[0.919]
 [1.028]
 [2.182]], shape=(3, 1), dtype=float32)



## Using the datasets on an ANN


In [39]:
# 1) create the datasets

# repeat = None is forwarded to Dataset.repeat(), which then repeats indefinitely;
# fit() bounds each epoch with steps_per_epoch instead.
train_set = csv_reader_dataset(train_filepaths, repeat = None)
# Validation and test sets keep the default repeat=1.
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

# clear session and set seed for reproducible results

keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# create model: one hidden layer of 30 ReLU units and a single linear output unit

model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

In [40]:
# Compile the model.
# `learning_rate` is the supported argument name; the old `lr` alias is deprecated
# and rejected by newer Keras versions.
model.compile(loss="mse",
              optimizer=keras.optimizers.SGD(learning_rate=0.01),
              metrics=[])

# Train the model.
# batch_size must match the batch size the datasets were built with (the
# csv_reader_dataset default, 32). Because train_set repeats indefinitely,
# steps_per_epoch tells fit() how many batches make up one epoch.
batch_size = 32
# fit() takes the tf.data datasets directly, where NumPy arrays
# (X_train, y_train) / (X_valid, y_valid) would normally be passed.
model.fit(train_set,
          steps_per_epoch=len(X_train) // batch_size,
          epochs=10,
          validation_data=valid_set)


Train for 362 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e98467b348>

In [41]:
# Evaluate the trained model on the test set.
# NOTE(review): test_set is built with the default repeat=1, so `steps` looks optional
# here; with the floor division a final partial batch would not be evaluated — confirm
# that this is intended.
model.evaluate(test_set, steps = len(X_test) // batch_size)



0.3672311701204466

In [None]:
# predict (we pretend)