In [1]:
import os

from tensorflow.keras import layers
from tensorflow.keras import Model
import tensorflow.keras as tk
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Loading Data as a numpy array

In [2]:
# Handle to the Fashion-MNIST loader that ships with Keras.
fashion_mnist = tk.datasets.fashion_mnist

# load_data() returns a (train, test) pair, each an (images, labels) pair.
train_split, test_split = fashion_mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

# Last expression of the cell: show the shapes of the training arrays.
train_images.shape, train_labels.shape

((60000, 28, 28), (60000,))

In [3]:
# %pip install -U tensorflow_datasets  # uncomment and run once if tensorflow_datasets is not installed

## Using the Dataset API
https://www.tensorflow.org/api_docs/python/tf/data/Dataset<br>
The `tf.data.Dataset` API has lots of features, so it is definitely worth having a cursory look through the methods outlined in the link above. However, I have included the ones I find most useful below.

In [4]:
import tensorflow_datasets as tfds

# with_info=True makes tfds.load return a (splits, info) pair;
# the individual splits are then looked up by name.
dataset, metadata = tfds.load('fashion_mnist', with_info=True, as_supervised=True)
train_dataset = dataset['train']
test_dataset = dataset['test']

2022-02-19 16:39:07.565700: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-19 16:39:10.443964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9640 MB memory:  -> device: 0, name: GeForce RTX 2080 Ti, pci bus id: 0000:1a:00.0, compute capability: 7.5
2022-02-19 16:39:10.445617: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 9640 MB memory:  -> device: 1, name: GeForce RTX 2080 Ti, pci bus id: 0000:1b:00.0, compute capability: 7.5
2022-02-19 16:39:10.447130: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/r

In [5]:
# Human-readable label names are read from the dataset metadata.
class_names = metadata.features['label'].names
print(f"Class names: {class_names}")

Class names: ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']


In [6]:
# Split sizes are read from the dataset metadata rather than hard-coded.
splits = metadata.splits
num_train_examples = splits['train'].num_examples
num_test_examples = splits['test'].num_examples
print(f"Number of training examples: {num_train_examples}")
print(f"Number of test examples:     {num_test_examples}")

Number of training examples: 60000
Number of test examples:     10000


### Caching
Caching makes it so that data is read and stored after the first iteration (epoch 1), either in memory or in a file. On the next pass through the data it will be read from this location, which should be faster.
**Note: Caching stores data as it is used, so if you are chaining some kind of transformation or function on the data like ".map", put it before the .cache(). However, put something like .shuffle() after the cache; otherwise it will only shuffle in the first iteration. Example below.**

### Batch
Just specifies how many samples the dataset will package and output every time it is asked for a new batch. Specifying it here means it doesn't need to be specified in model.fit().

### Map
Simply applies some function/transformation to each element. Very useful for things like normalization or calculations.

### Prefetch
Prefetch is a super useful feature: it allows the next batch of data to be pulled in and prepared while the current batch is being processed by the model. This improves the throughput of the training process. Most dataset input pipelines should end with a call to prefetch.

### Shuffle
This loads a specified number of samples into a buffer and then shuffles them before selecting a batch. Selected samples get replaced with new samples from the rest of the dataset. For the entire dataset to be shuffled in the traditional sense, the buffer_size needs to be greater than or equal to the total number of samples.

**Note:** You can often pass "tf.data.AUTOTUNE" as a parameter to many of these dataset functions and TensorFlow will automatically try to optimize them.

In [7]:
def normalize(images, labels):
  """Cast `images` to float32 and divide by 255; `labels` pass through unchanged.

  Args:
    images: image tensor to rescale.
    labels: labels paired with `images`.

  Returns:
    A `(scaled_images, labels)` tuple.
  """
  scaled_images = tf.cast(images, tf.float32) / 255
  return scaled_images, labels

# Both pipelines are built from the raw splits in `dataset` (not from
# `train_dataset` / `test_dataset` themselves) so that re-running this cell
# does not stack a second normalize/batch on an already-processed pipeline.
#
# Order matters: map before cache (the normalized data is what gets cached),
# shuffle after cache (so the data is reshuffled on every epoch).
train_dataset = (
    dataset['train']
    .map(normalize, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    # A buffer that covers the whole training split shuffles the entire
    # dataset; a small buffer (e.g. 200) only mixes nearby samples.
    .shuffle(num_train_examples)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

# The test split is not shuffled.
test_dataset = (
    dataset['test']
    .map(normalize, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [8]:
# Usage sketch (no `model` is defined at this point in the notebook):
# the batched dataset is passed directly to fit, without a batch_size argument.
# model.fit(train_dataset, epochs=5)

## Loading data from csv

In [9]:
import pandas as pd

TITANIC_CSV_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"

# read_csv is given the URL directly, so no separate download step is needed.
titanic = pd.read_csv(TITANIC_CSV_URL)
titanic.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [10]:
# Separate the target column from the model inputs; `titanic` itself is left untouched.
titanic_labels = titanic['survived'].copy()
titanic_features = titanic.drop(columns=['survived'])

## **tf.data.Dataset.from_tensor_slices()**
This will probably be used if making your own datasets from "wild" data. So pay attention!

As you are probably familiar with, you can just go ahead and use the titanic_features and titanic_labels as your x and y right now. However, you could also take this a step forward and turn this csv data into an actual tf.data.Dataset to leverage all the useful features that API gives you.

In order to do this the first step is to convert the dataframe into a **dictionary of tensors**. This is a dictionary where the feature names (columns) are the keys and the associated values are the entire array of values for that feature. This is done below:

In [11]:
# Turn dataframe into dictionary of tensors: {column name -> NumPy array of that column}
titanic_features_dict = {
    column_name: np.array(titanic_features[column_name])
    for column_name in titanic_features.columns
}
# titanic_features_dict

This needs to be done in order to use the tf.data.Dataset.from_tensor_slices() function, as shown below.

In [12]:
features_dataset_no_labels = tf.data.Dataset.from_tensor_slices(titanic_features_dict)

# Peek at the first element only; each element is a dict keyed by feature name.
for example in features_dataset_no_labels.take(1):
  for name, value in example.items():
    print(f"{name:19s}: {value}")

sex                : b'male'
age                : 22.0
n_siblings_spouses : 1
parch              : 0
fare               : 7.25
class              : b'Third'
deck               : b'unknown'
embark_town        : b'Southampton'
alone              : b'n'


tf.data.Dataset.from_tensor_slices() is flexible; here is a way to use it with the labels as part of the new dataset:

In [13]:
# Passing a (features, labels) tuple makes every element a (feature dict, label) pair.
features_dataset = tf.data.Dataset.from_tensor_slices((titanic_features_dict, titanic_labels))

# Peek at the first element only.
for example, label in features_dataset.take(1):
  print(f'\t They are {"Alive! :)" if label else "Dead :("}')
  for name, value in example.items():
    print(f"{name:19s}: {value}")

	 They are Dead :(
sex                : b'male'
age                : 22.0
n_siblings_spouses : 1
parch              : 0
fare               : 7.25
class              : b'Third'
deck               : b'unknown'
embark_town        : b'Southampton'
alone              : b'n'


There is also experimental support for creating a dataset directly from a CSV file without reading it into memory and turning it into a dataframe first. Read about it here: https://www.tensorflow.org/api_docs/python/tf/data/experimental/make_csv_dataset

#### For very large datasets TensorFlow also allows for creating dataset objects that are connected directly to files and can read from the disk.
This is an advanced topic, but details can be found here: https://www.tensorflow.org/guide/data#basic_mechanics

# tf.keras.layers preprocessing layers
Complete guide here: https://www.tensorflow.org/guide/keras/preprocessing_layers#quick_recipes <br>
Useful examples: https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers

These preprocessing layers allow things like normalization and one-hot encoding to be done as part of the model itself. This makes the model a truly end-to-end solution and makes it much more portable across platforms and environments.

### tf.keras.layers.Normalization

Allows the normalization process to be incorporated into the model itself, leaving the dataset untouched. Normalizing is important because even neural nets will converge faster if the mean of each feature is near 0.<br>

In [14]:
# x holds 11 samples, each with 2 features
# r is x shifted by 1; it is just there to show the adapted layer can be applied to other input

x = np.linspace(-4, 6, num=22).reshape(-1, 2)
r = x + 1
print(f'x: {x.T}')
print(f'r: {r.T}')

norm_layer = tf.keras.layers.Normalization()
# adapt() computes the mean and variance of x that the layer then uses to normalize its input
norm_layer.adapt(x)
print(norm_layer(x))
print(norm_layer(r))
norm_layer.mean, norm_layer.variance

x: [[-4.         -3.04761905 -2.0952381  -1.14285714 -0.19047619  0.76190476
   1.71428571  2.66666667  3.61904762  4.57142857  5.52380952]
 [-3.52380952 -2.57142857 -1.61904762 -0.66666667  0.28571429  1.23809524
   2.19047619  3.14285714  4.0952381   5.04761905  6.        ]]
r: [[-3.         -2.04761905 -1.0952381  -0.14285714  0.80952381  1.76190476
   2.71428571  3.66666667  4.61904762  5.57142857  6.52380952]
 [-2.52380952 -1.57142857 -0.61904762  0.33333333  1.28571429  2.23809524
   3.19047619  4.14285714  5.0952381   6.04761905  7.        ]]
tf.Tensor(
[[-1.5811388  -1.5811388 ]
 [-1.264911   -1.264911  ]
 [-0.9486833  -0.9486833 ]
 [-0.6324556  -0.6324556 ]
 [-0.31622776 -0.3162278 ]
 [ 0.          0.        ]
 [ 0.31622776  0.31622773]
 [ 0.6324556   0.63245547]
 [ 0.9486833   0.9486833 ]
 [ 1.2649112   1.2649109 ]
 [ 1.5811388   1.5811388 ]], shape=(11, 2), dtype=float32)
tf.Tensor(
[[-1.2490996  -1.2490996 ]
 [-0.93287194 -0.93287194]
 [-0.61664414 -0.61664414]
 [-0.3004163

2022-02-19 16:39:10.934311: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


(<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.7619048, 1.2380953]], dtype=float32)>,
 <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[9.070294, 9.070294]], dtype=float32)>)

If all of the data is numerical, then a normalization layer can just be adapted to the entire training data and simply added as the first layer of the TensorFlow model. **Easy**.<br>
However, if some of the features are categorical, then things get a little more complicated. The good news is that there are also preprocessing layers that allow the model to do things like automatically converting strings into one-hot encodings. The bad news is that doing this requires the Keras **functional API**, which is more complicated than the sequential one. If you have not reviewed the **functional vs sequential API** notes yet, it is recommended to understand those first before trying this.

### Keras.Input objects for Symbolic representation
keras.Input objects are simply used to create symbolic placeholder Keras tensors. <br>
First, create keras.Input objects to represent each of your features:

In [15]:
# One symbolic placeholder per feature column: object (string) columns become
# tf.string inputs, every other column becomes a tf.float32 input.
inputs = {
    name: tf.keras.Input(
        shape=(1,),
        name=name,
        dtype=tf.string if column.dtype == object else tf.float32,
    )
    for name, column in titanic_features.items()
}

inputs

{'sex': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'sex')>,
 'age': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>,
 'n_siblings_spouses': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'n_siblings_spouses')>,
 'parch': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'parch')>,
 'fare': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'fare')>,
 'class': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'class')>,
 'deck': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'deck')>,
 'embark_town': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'embark_town')>,
 'alone': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'alone')>}

Then isolate all the numeric features and create a normalization layer just for them:

In [16]:
# Keep only the float32 (numeric) placeholders.
numeric_inputs = {
    name: tensor
    for name, tensor in inputs.items()
    if tensor.dtype == tf.float32
}

# Join the numeric placeholders into one tensor and normalize them together.
numeric_concat = layers.Concatenate()(list(numeric_inputs.values()))
norm = layers.Normalization()
# Adapt on the matching dataframe columns, in the same order as the concatenated inputs.
norm.adapt(np.array(titanic[list(numeric_inputs)]))
all_numeric_inputs = norm(numeric_concat)

all_numeric_inputs

<KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'normalization_1')>

## tf.keras.layers.StringLookup
## tf.keras.layers.CategoryEncoding
The StringLookup layer automatically fits a dictionary (vocabulary) on string data and then uses that to encode incoming data with numeric labels. If you don't know how dictionaries work in NLP, then refer to those notes.<br>
The Category Encoding layer then converts numeric input into one-hot encoding, although it can also be used for multi-hot encoding.

In [17]:
# Start with the already-normalized numeric block, then append one encoded
# tensor per string feature.
all_inputs = [all_numeric_inputs]

# NOTE: the loop variable is deliberately not called `input`; at notebook top
# level that would rebind the builtin `input()` for the rest of the session.
for name, feature_input in inputs.items():
  if feature_input.dtype == tf.float32:
    continue  # numeric features are already covered by all_numeric_inputs

  # Vocabulary = the distinct strings present in this column of the data.
  lookup = layers.StringLookup(vocabulary=np.unique(titanic_features[name]))
  one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  # string -> integer index -> 0/1 indicator vector
  encoded = one_hot(lookup(feature_input))
  all_inputs.append(encoded)


In [18]:
# layers.Concatenate just takes in a list of tensors and returns one combined tensor
preprocessed_inputs_cat = layers.Concatenate()(all_inputs)
# Functional API: the model is defined by its input placeholders and its output tensor
titanic_preprocessing = tf.keras.Model(inputs=inputs, outputs=preprocessed_inputs_cat)

titanic_preprocessing.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
age (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
n_siblings_spouses (InputLayer) [(None, 1)]          0                                            
__________________________________________________________________________________________________
parch (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
fare (InputLayer)               [(None, 1)]          0                                            
______________________________________________________________________________________________

In [19]:
# Take the first row of every feature; slicing with [:1] (rather than indexing
# with [0]) keeps each value a length-1 array.
features_dict = {
    feature: column[:1]
    for feature, column in titanic_features_dict.items()
}
print(features_dict)
titanic_preprocessing(features_dict)

{'sex': array(['male'], dtype=object), 'age': array([22.]), 'n_siblings_spouses': array([1]), 'parch': array([0]), 'fare': array([7.25]), 'class': array(['Third'], dtype=object), 'deck': array(['unknown'], dtype=object), 'embark_town': array(['Southampton'], dtype=object), 'alone': array(['n'], dtype=object)}


<tf.Tensor: shape=(1, 28), dtype=float32, numpy=
array([[-0.610415 ,  0.395198 , -0.4790527, -0.4974028,  0.       ,
         0.       ,  1.       ,  0.       ,  0.       ,  0.       ,
         1.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  1.       ,
         0.       ,  0.       ,  0.       ,  1.       ,  0.       ,
         0.       ,  1.       ,  0.       ]], dtype=float32)>

Below we actually put everything together and create this new model using both the Sequential and Functional API!

In [20]:
def build_titanic_model(preprocessing_head, inputs):
  """Assemble and compile the end-to-end Titanic classifier.

  Args:
    preprocessing_head: callable (here a Keras model) applied to `inputs` to
      produce the preprocessed feature tensor.
    inputs: the symbolic Keras inputs; they also become the inputs of the
      returned model.

  Returns:
    A compiled `tf.keras.Model` whose output is a single logit per example
    (the loss is configured with `from_logits=True`).
  """
  # NOTE(review): Dense(64) is created without an activation, so the two
  # stacked layers form a purely linear map. Pass activation='relu' to the
  # first layer if a non-linear model is intended.
  body = tf.keras.Sequential([
    layers.Dense(64),
    layers.Dense(1)
  ])

  # Functional API wiring: raw inputs -> preprocessing -> dense body.
  preprocessed_inputs = preprocessing_head(inputs)
  result = body(preprocessed_inputs)
  model = tf.keras.Model(inputs, result)

  model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.optimizers.Adam())
  return model

# The builder has its own name, distinct from the variable holding its result,
# so this cell can be re-run: binding the model to the builder's own name
# would replace the function and make a second run fail.
titanic_model = build_titanic_model(titanic_preprocessing, inputs)

In [21]:
# Train directly on the raw feature dict and the label column for 10 epochs.
titanic_model.fit(x=titanic_features_dict, y=titanic_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b881e92b100>