**Chapter 13 – Loading and Preprocessing Data with TensorFlow**

_This notebook contains all the sample code and solutions to the exercises in chapter 13._

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/ageron/handson-ml2/blob/master/13_loading_and_preprocessing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
  <td>
    <a target="_blank" href="https://kaggle.com/kernels/welcome?src=https://github.com/ageron/handson-ml2/blob/master/13_loading_and_preprocessing_data.ipynb"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" /></a>
  </td>
</table>

# Setup

First, let's import a few common modules and ensure Matplotlib plots figures inline. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB or IS_KAGGLE:
    %pip install -q -U tfx
    print("You can safely ignore the package incompatibility errors.")

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

[K     |████████████████████████████████| 2.5 MB 5.3 MB/s 
[K     |████████████████████████████████| 1.7 MB 52.7 MB/s 
[K     |████████████████████████████████| 133 kB 60.3 MB/s 
[K     |████████████████████████████████| 23.6 MB 1.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 31.3 MB/s 
[K     |████████████████████████████████| 10.1 MB 50.1 MB/s 
[K     |████████████████████████████████| 206 kB 65.9 MB/s 
[K     |████████████████████████████████| 147 kB 72.9 MB/s 
[K     |████████████████████████████████| 135 kB 60.6 MB/s 
[K     |████████████████████████████████| 1.4 MB 46.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 53.4 MB/s 
[K     |████████████████████████████████| 49 kB 5.2 MB/s 
[K     |████████████████████████████████| 40 kB 4.7 MB/s 
[K     |████████████████████████████████| 1.8 MB 49.4 MB/s 
[K     |████████████████████████████████| 19.2 MB 1.2 MB/s 
[K     |████████████████████████████████| 1.8 MB 33.2 MB/s 
[K     |███████████████████

# Datasets

In [112]:
# Synthetic data: 1000 samples with 10 features each, plus one target per sample.
n_objects, n_features = 1000, 10
X = tf.random.normal(shape=(n_objects, n_features), stddev=5)
y = tf.random.normal(shape=(n_objects,), mean=-10, stddev=10)

# Pair each row of X with the matching entry of y.
dataset = tf.data.Dataset.from_tensor_slices((X, y))

# Mean and variance over all entries of X (the cell's displayed result).
tf.nn.moments(X, axes=[0, 1])

(<tf.Tensor: shape=(), dtype=float32, numpy=-0.039994154>,
 <tf.Tensor: shape=(), dtype=float32, numpy=25.092596>)

In [118]:


# Print the first five (features, target) pairs, numbered from 1,
# each followed by a 54-character rule.
separator = '-' * 54
for i, (x_s, y_s) in enumerate(dataset.take(5), start=1):
    print(f"""
    X[{i}] = {x_s}
    y[{i}] = {y_s}
    {separator}
    """)


    X[1] = [ -1.2064817    3.4530177    8.105913    -4.346487    -0.61183995
 -12.2997875   15.912649     0.10526749  -3.4976225   -3.8707557 ]
    y[1] = -17.507362365722656
    ------------------------------------------------------
    

    X[2] = [ 4.2608895 -1.92782   -2.006689  -7.8956566  4.843286  14.662643
  5.0025234 -6.908925  -1.2007484 -3.719258 ]
    y[2] = -11.026020050048828
    ------------------------------------------------------
    

    X[3] = [ 1.7640494  5.721517   5.7136292  2.9074314 -1.114125   1.3205807
 -0.6651038  1.0006337 -3.729916  -1.3796635]
    y[3] = -15.632390975952148
    ------------------------------------------------------
    

    X[4] = [-8.404918  -1.0587999  6.450649   3.1960282  4.5629787  1.1638559
 -4.6865726  6.3556633 -3.4738255 -3.0593262]
    y[4] = -17.387359619140625
    ------------------------------------------------------
    

    X[5] = [ 3.8877769  5.6829305 -5.8341937 -1.5963126  8.230314   2.6893818
  5.801244  -1.4829756

In [119]:
len(dataset.batch(7)) # ceil(1000 / 7) = 143: 142 full batches plus a final batch with the 6 leftover elements

143

In [138]:
# Batch into groups of 27, then flatten back to individual elements
# and show the first one.
round_trip = dataset.batch(27).unbatch()
print(*round_trip.take(1))

(<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([-5.2476311e+00, -1.3515381e+01,  4.3755951e+00,  9.0988150e+00,
        4.7497540e-03, -2.3433008e+00, -2.0487716e+00, -2.0707195e+00,
        4.8688519e-01, -4.6002836e+00], dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-6.531117>)


In [155]:
len(dataset.repeat(count=3))  # three passes over the 1000 elements -> 3000

3000

In [130]:
# Shuffle through a 10-element buffer with a fixed seed. The buffer is much
# smaller than the 1000-element dataset, so this is only a local shuffle.
# NOTE: this rebinds `dataset`; re-running the cell stacks another shuffle on top.
dataset = dataset.shuffle(seed=54, buffer_size=10)

In [None]:
def fold_and_square(x, y):
    """Add the first five features to the last five, square, and halve.

    The target ``y`` is passed through unchanged.
    """
    folded = x[:5] + x[5:]
    return 0.5 * folded**2, y


mapped_dataset = dataset.map(fold_and_square)
print(*mapped_dataset.take(5), sep=f"\n{'-'*54}\n")

In [151]:
# Keep only the samples whose feature vector has a norm below 10,
# then show the first such sample.
small_norm_ds = dataset.filter(lambda x, y: tf.norm(x) < 10)
print(*small_norm_ds.take(1))

(<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 1.7640494,  5.721517 ,  5.7136292,  2.9074314, -1.114125 ,
        1.3205807, -0.6651038,  1.0006337, -3.729916 , -1.3796635],
      dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-15.632391>)


In [96]:
# Group the samples into batches of 54 and report the shape of each
# component (features first, then targets) for the first five batches.
batched_ds = dataset.batch(54)
for batch in batched_ds.take(5):
    features, targets = batch
    print(f"I = {features.shape}")
    print(f"I = {targets.shape}")

I = (54, 10)
I = (54,)
I = (54, 10)
I = (54,)
I = (54, 10)
I = (54,)
I = (54, 10)
I = (54,)
I = (54, 10)
I = (54,)
