**Chapter 13 – Loading and Preprocessing Data with TensorFlow**

_This notebook contains all the sample code and solutions to the exercises in chapter 13._

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/ageron/handson-ml2/blob/master/13_loading_and_preprocessing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
  <td>
    <a target="_blank" href="https://kaggle.com/kernels/welcome?src=https://github.com/ageron/handson-ml2/blob/master/13_loading_and_preprocessing_data.ipynb"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" /></a>
  </td>
</table>

# Setup

First, let's import a few common modules and ensure Matplotlib plots figures inline. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB or IS_KAGGLE:
    %pip install -q -U tfx
    print("You can safely ignore the package incompatibility errors.")

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

[K     |████████████████████████████████| 2.5 MB 4.3 MB/s 
[K     |████████████████████████████████| 10.1 MB 19.8 MB/s 
[K     |████████████████████████████████| 433 kB 48.4 MB/s 
[K     |████████████████████████████████| 135 kB 66.6 MB/s 
[K     |████████████████████████████████| 1.8 MB 57.8 MB/s 
[K     |████████████████████████████████| 133 kB 45.9 MB/s 
[K     |████████████████████████████████| 147 kB 49.2 MB/s 
[K     |████████████████████████████████| 206 kB 53.7 MB/s 
[K     |████████████████████████████████| 1.4 MB 44.8 MB/s 
[K     |████████████████████████████████| 23.6 MB 60.6 MB/s 
[K     |████████████████████████████████| 40 kB 4.0 MB/s 
[K     |████████████████████████████████| 1.3 MB 42.1 MB/s 
[K     |████████████████████████████████| 1.8 MB 36.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 23.5 MB/s 
[K     |████████████████████████████████| 49 kB 2.1 MB/s 
[K     |████████████████████████████████| 19.2 MB 1.4 MB/s 
[K     |██████████████████

# Datasets

## Basics

In [2]:
# Toy regression data: n_objects samples with n_features Gaussian features
# each (stddev 5), plus one Gaussian target per sample (mean -10, stddev 10).
n_objects = 1_000
n_features = 10

X = tf.random.normal(shape=(n_objects, n_features), stddev=5)
y = tf.random.normal(shape=(n_objects,), mean=-10, stddev=10)

# Slice along the first axis: one dataset element per (feature row, target) pair.
dataset = tf.data.Dataset.from_tensor_slices((X, y))

# Mean and variance over every entry of X (both axes reduced).
all_axes = [0, 1]
tf.nn.moments(X, axes=all_axes)

(<tf.Tensor: shape=(), dtype=float32, numpy=0.0544856>,
 <tf.Tensor: shape=(), dtype=float32, numpy=25.59864>)

In [3]:


# Print the first five (features, target) pairs, numbered from 1.
first_samples = dataset.take(5)
for i, (x_s, y_s) in enumerate(first_samples, start = 1):
    print(f"""
    X[{i}] = {x_s}
    y[{i}] = {y_s}
    {'-'*54}
    """)


    X[1] = [ 0.5097965  -4.8581986  -0.81164867  6.190033   -3.458878    0.43297905
 -0.57776046  8.867246    7.598179    9.348671  ]
    y[1] = -17.285953521728516
    ------------------------------------------------------
    

    X[2] = [ 4.6078663   2.5444815   0.63235575  5.934783    1.3360726  -8.986338
 -9.014416    1.2781007   1.1534915   9.917941  ]
    y[2] = -19.71023941040039
    ------------------------------------------------------
    

    X[3] = [ -4.7229524   -2.095       -0.18447204  -0.6224143    1.0430775
   0.6071024    0.30097777  -5.0319533   -3.558369   -14.671318  ]
    y[3] = -14.945405006408691
    ------------------------------------------------------
    

    X[4] = [ 7.1795945  -2.5722125   6.0105424  -0.10796881 -1.1245143  -5.535875
 -0.83877206  2.671073   -1.2804797  -6.9400854 ]
    y[4] = 3.21343994140625
    ------------------------------------------------------
    

    X[5] = [ 2.6259732  -0.5168303   0.82936615 -5.1852484   2.0407796  -0.387

In [4]:
len(dataset.batch(7)) # ceil(1000 / 7) = 143: the last batch holds the 6 leftover elements

143

In [5]:
# Batching and then unbatching yields individual elements again; show the first one.
roundtrip = dataset.batch(27).unbatch()
print(*roundtrip.take(1))

(<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.5097965 , -4.8581986 , -0.81164867,  6.190033  , -3.458878  ,
        0.43297905, -0.57776046,  8.867246  ,  7.598179  ,  9.348671  ],
      dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-17.285954>)


In [6]:
len(dataset.repeat(3)) # three passes over the 1000 elements: 3 * 1000 = 3000

3000

In [7]:
# Shuffle with a fixed seed. This rebinds `dataset`, so the cells below work on
# the shuffled version, and re-running this cell stacks another shuffle on top.
dataset = dataset.shuffle(buffer_size=10, seed = 54) # buffer (10) is far smaller than the 1000 elements, so per the tf.data docs the shuffle is not a full uniform one

In [8]:
def fold_halves(x, y):
    """Add the first and last five features, square the sum and halve it; y is passed through."""
    return 0.5 * (x[:5] + x[5:])**2, y


mapped_dataset = dataset.map(fold_halves)
print(*mapped_dataset.take(5), sep=f"\n{'-'*54}\n")

(<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([  3.1171443,  31.452534 ,  15.786334 , 112.349846 ,  35.588203 ],
      dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-5.9586744>)
------------------------------------------------------
(<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([229.67265 ,  20.949066,  24.612728,  51.211483,  10.367193],
      dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-15.388119>)
------------------------------------------------------
(<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([ 0.44441286, 14.774824  , 32.446323  , 95.057396  , 17.34483   ],
      dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-17.285954>)
------------------------------------------------------
(<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([ 4.0931463 , 59.02788   ,  5.096079  ,  0.74867976, 34.953773  ],
      dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-14.428833>)
--------------------------------------------

In [9]:
def has_small_norm(x, y):
    """Predicate keeping elements whose feature vector has a norm (tf.norm default) below 10."""
    return tf.norm(x) < 10


print(*dataset.filter(has_small_norm).take(1))

(<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 2.6259732 , -0.5168303 ,  0.82936615, -5.1852484 ,  2.0407796 ,
       -0.38768512, -3.3539722 , -1.4524497 , -2.2390363 , -3.1496148 ],
      dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-23.784569>)


In [12]:
# Group the elements into batches of 54 and print the shape of each component
# (features, then targets) for the first five batches.
batched_ds = dataset.batch(54)
first_batches = batched_ds.take(5)
for batch in first_batches:
    for i in batch:
        print(f"I = {i.shape}")

I = (54, 10)
I = (54,)
I = (54, 10)
I = (54,)
I = (54, 10)
I = (54,)
I = (54, 10)
I = (54,)
I = (54, 10)
I = (54,)


## Split the California dataset into multiple CSV files

In [13]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd

In [19]:
housing = fetch_california_housing()

# X and y map a split name ('train', 'val', 'test') to its feature / target array.
X, y = dict(), dict()

# First split: hold out the test set. The targets are reshaped into a single
# column so that they stay 2D, like the features.
X['train'], X['test'], y['train'], y['test'] = train_test_split(
    housing['data'], housing['target'].reshape(-1,1),
    random_state = 54
)

# Second split: carve the validation set out of the remaining training data.
X['train'], X['val'], y['train'], y['val'] = train_test_split(
    X['train'], y['train'],
    random_state = 54
)

# Show the shape of the validation targets (this is the value recorded as the
# cell's output; without a final expression the cell displays nothing).
y['val'].shape


(3870, 1)

In [27]:
# Peek at the first row of the validation features.
X['val'][0]

array([   4.6071    ,   52.        ,    6.03018868,    1.0754717 ,
        689.        ,    2.6       ,   37.75      , -122.47      ])

In [20]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Write the rows of `data` to `n_parts` CSV files and return their paths.

    The files are created under datasets/housing (the directory is created if
    it does not exist) and are named my_<name_prefix>_<NN>.csv, where NN is the
    zero-padded part index. Row indices are distributed with np.array_split, so
    each file receives a contiguous run of rows.

    Args:
        data: Indexable collection of rows (e.g. a 2D NumPy array); each row is
            an iterable of values and becomes one CSV line.
        name_prefix: Label embedded in every file name (e.g. "train").
        header: Optional header line written at the top of every file.
        n_parts: Number of files to write.

    Returns:
        The list of file paths, in part order.
    """
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    def format_value(value):
        # NumPy >= 2.0 reprs scalars as e.g. "np.float64(4.6071)", which would
        # corrupt the CSV. Converting NumPy scalars to plain Python scalars
        # first keeps the output as the bare value on every NumPy version.
        if isinstance(value, np.generic):
            value = value.item()
        return repr(value)

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([format_value(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [35]:
# For every split, append the target column to the right of the feature columns.
data = {
    split: np.c_[X[split], y[split]]
    for split in ('train', 'val', 'test')
}

# CSV header: the feature names followed by the name of the target column.
header = ",".join([*housing.feature_names, "MedianHouseValue"])

# Write each split to 20 CSV files and remember the paths per split.
filepaths = {
    split: save_to_multiple_csv_files(split_data, split, header, n_parts=20)
    for split, split_data in data.items()
}
filepaths['val'][:5]

['datasets/housing/my_val_00.csv',
 'datasets/housing/my_val_01.csv',
 'datasets/housing/my_val_02.csv',
 'datasets/housing/my_val_03.csv',
 'datasets/housing/my_val_04.csv']

In [None]:
filepath_dataset = 