In [1]:
%matplotlib notebook

import numpy as np
import tensorflow as tf

# Tensors

## Construction

A tensor is an **N-dimensional array**, like in NumPy:

In [2]:
# A 2x2 matrix and a vector of length 2, both built from (nested) Python lists.
a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
b = tf.constant([5.0, 6.0])

# Formatting a tensor inside an f-string displays its values.
print(f"a = {a}")
print(f"b = {b}")

a = [[1. 2.]
 [3. 4.]]
b = [5. 6.]


In [3]:
# Shape of the tensor: its size along each dimension
a.shape

TensorShape([2, 2])

In [4]:
# Number of dimensions of the tensor
a.ndim

2

Alternative tensor **constructions** exist:

In [5]:
# From a NumPy array (here 3x4 samples of a normal distribution with mean 0 and standard deviation 1)
e1 = tf.constant(np.random.normal(loc=0, scale=1, size=(3, 4)))
# Filled with ones
e2 = tf.ones(shape=(3, 4))
# Filled with zeros
e3 = tf.zeros(shape=(3, 4))

print(f"e1 = {e1}\n")
print(f"e2 = {e2}\n")
print(f"e3 = {e3}\n")

e1 = [[ 1.33359685  0.13459115 -1.1691243   0.18322556]
 [ 0.45117772  1.00664955 -1.00284915  0.84536542]
 [-0.01814427 -0.34787968 -0.30603083  1.59084713]]

e2 = [[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]

e3 = [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]



Like in numpy, you can specify the **data type** (`int`, `float`, `double`, ...):

In [None]:
# The same value (1 / 3) stored with two different floating-point types:
# compare the digits printed for each of them.
e1 = tf.ones((3, 4), dtype=tf.float32) / 3
e2 = tf.ones((3, 4), dtype=tf.float64) / 3

print(f"float32: {e1[0, 0]}\n")
print(f"float64: {e2[0, 0]}\n")

## Operations

Like in numpy, many operations are defined.

However, you must only use **Tensorflow functions** (no Numpy!) in order to benefit from the automatic differentiation framework.

In [None]:
# Indexing a single element (row 0, column 1)
a[0, 1]

In [None]:
# Same element, converted to a NumPy value with .numpy()
a[0, 1].numpy()

In [None]:
# Broadcasting: the vector b is added to each row of the matrix a
c = a + b
print(f"c = {c}")

In [None]:
# Sub-tensor: first row of c (all its columns)
d = c[0, :]
print(f"d = {d}")

In [None]:
# Other functions that return a new tensor
print(tf.reduce_max(c))  # maximum over all the elements of c
print(tf.sin(c))  # element-wise sine
print(tf.reduce_sum(c**2))  # sum of the squared elements of c

More operations are available in **tf.math** https://www.tensorflow.org/api_docs/python/tf/math

## To and from Numpy

You can easily get a `tf.Tensor` from a numpy array and vice-versa.

In [None]:
# NumPy array -> tensor.
# NOTE: this rebinds `a` and `b`, which were tensors in the previous sections.
a = np.random.rand(3, 2)
b = tf.convert_to_tensor(a)
print(f"a = {a}")
print(f"b = {b}")

In [None]:
# Tensor -> NumPy array: .numpy() gives back the content of the tensor
c = b.numpy()

# The original array, the tensor built from it, and the array obtained back from the tensor
print(f"a = {a}")
print(f"b = {b}")
print(f"c = {c}")

# Neural networks

A **feed-forward** neural network can be seen as a function $f_\theta: \mathbb{R}^N \to \mathbb{R}^M$ that is the composition of layers of two kinds:

- **affine layers** (also called "linear" layers).
  A typical example is $L(x) = A x + b$ with $A$ a matrix of appropriate size.
  We can also consider a convolution layer $C(x) = x \star k$ with $k$ the kernel.
- **non-linear layers**.
  Typical non-linear layers are *activation layers*, like the function $ReLU(x) = \max(0, x)$.

For example, we could have:

$$f = L_k \circ ReLU \circ L_{k-1} \circ ReLU \circ \dots \circ ReLU \circ L_1$$

Here, if $L_i(x) = A_i x + b_i$, the **parameters** $\theta$ of the model are all the coefficients of $A_i$ and $b_i$.

**Note that:**
- using two **consecutive linear layers** is equivalent to one linear layer,
- using a non-linear layer as **last layer** is uncommon since it bounds the output values,
- we typically count only the linear layers, so that:
  - the identity function (inputs = outputs) is called a 0-layer neural network,
  - the function $f_\theta = L$ composed of only one linear layer is called a 1-layer neural network,
  - the function $f = L_k \circ ReLU \circ L_{k-1} \circ ReLU \circ \dots \circ ReLU \circ L_1$ is called a k-layer neural network.
- many **activation layers** exist with different properties, see http://cs231n.github.io/neural-networks-1/#actfun

## Layers

### Common layers

Tensorflow features many kinds of layers in the `tf.keras.layers` namespace (see https://www.tensorflow.org/api_docs/python/tf/keras/layers/).

They are implemented as classes and you can create custom layers by defining a class that inherits from `tf.keras.layers.Layer` (see https://www.tensorflow.org/tutorials/customization/custom_layers)

Some common layers are:
- linear (or **dense**) layer in [`tf.keras.layers.Dense`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense) that models the $Ax + b$ function with $A \in \mathcal{M}_{M, N}(\mathbb{R})$ a dense matrix and $b \in \mathbb{R}^M$ the optional bias.
- activation layers in https://www.tensorflow.org/api_docs/python/tf/keras/activations like 
  [`tf.keras.activations.relu`](https://www.tensorflow.org/api_docs/python/tf/keras/activations/relu), 
  [`tf.keras.activations.elu`](https://www.tensorflow.org/api_docs/python/tf/keras/activations/elu), 
  [`tf.keras.activations.tanh`](https://www.tensorflow.org/api_docs/python/tf/keras/activations/tanh),
  [`tf.keras.activations.sigmoid`](https://www.tensorflow.org/api_docs/python/tf/keras/activations/sigmoid),...

In [None]:
# A very simple model (a perceptron): a single dense layer with 2 outputs.
# The input size is not declared here: the layer creates its matrix and bias
# the first time it is called, from the last dimension of the input it receives.
# (Keras' `input_shape` argument excludes the batch axis, so `(None, 10)` would
# describe rank-3 inputs instead of batches of vectors of size 10.)
layer = tf.keras.layers.Dense(2)

In [None]:
# To apply it, simply call it (here on a batch containing one zero vector of size 10)
layer(tf.zeros((1, 10)))

In [None]:
# Displaying its parameters: `kernel` is the matrix and `bias` the bias vector of the layer
print(layer.kernel)
print(layer.bias)

### Sequence of layers

When your model is only a sequence of layers, you can simply use a `tf.keras.Sequential` module with all layers as parameters.

In [None]:
# A model with two dense layers, and a ReLU activation after the first one:
# N inputs -> N // 2 hidden units (ReLU) -> M outputs (no activation)
N, M = 10, 2
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(N // 2, input_dim=N, activation='relu'))
model.add(tf.keras.layers.Dense(M, activation='linear'))

model.summary()
print()

# The loop variable is deliberately not named `layer`, so that the standalone
# `layer` created earlier in the notebook is not overwritten by this cell.
for i, dense_layer in enumerate(model.layers):
    print(f"Parameters for layer {i}")
    print(dense_layer.kernel)
    print(dense_layer.bias)
    print()

In [None]:
# Applying the model on 3 random inputs (one input of size N per row of x)
x = tf.random.normal((3, N))
y = model(x)
print(y)

### Custom activation function

You can create custom activation functions. They are simply TensorFlow functions that take a tensor as input and return a tensor of the same shape and type.

In [None]:
# Gaussian-shaped activation function
def gaussian_activation(t):
    """Return exp(-t**2), computed element-wise on the tensor `t`."""
    squared = t**2
    return tf.math.exp(-squared)

In [None]:
# Same architecture as `model`, with the custom activation after the first dense layer
model2 = tf.keras.Sequential([
    tf.keras.layers.Dense(N // 2, input_dim=N, activation=gaussian_activation),
    tf.keras.layers.Dense(M, activation='linear'),
])

# Applying the model on the same 3 inputs
y = model2(x)
print(y)

## Loss functions

Some loss functions are available in `tf.keras.losses` (see https://www.tensorflow.org/api_docs/python/tf/keras/losses):
- [`tf.keras.losses.MeanSquaredError`](https://www.tensorflow.org/api_docs/python/tf/keras/losses/MeanSquaredError) for the squared l2-norm (averaged over the components),
- [`tf.keras.losses.SparseCategoricalCrossentropy`](https://www.tensorflow.org/api_docs/python/tf/keras/losses/SparseCategoricalCrossentropy) for a loss function adapted to classification problems.

Given a model (a neural network) that depends on some parameters, we want to find values of these parameters such that the result of the model applied to some data is close to what we want.
This "distance" is given by the loss function. We want to minimize the loss function with respect to the parameters of the model.

# Optimization

Tensorflow features many optimization algorithms in the `tf.optimizers` module (see the [documentation](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers)).

All these optimizers have a common interface:
1. first, define your model and its **parameters**:

In [None]:
import tensorflow as tf

# Number of Rosenbrock functions averaged in the loss.
# NOTE: this cell rebinds N, a and b, already used earlier in the notebook.
N = 1000

# The two parameters of each Rosenbrock function:
# a is drawn around 1 (standard deviation 0.1) and b around 10 (standard deviation 1)
a = tf.random.normal((N,), mean=1, stddev=0.1, dtype=tf.float64)
b = tf.random.normal((N,), mean=10, stddev=1, dtype=tf.float64)

# This is the minimum of the sum of the Rosenbrock functions:
# x is the mean of the a's and y is its square
ref_min_x = tf.math.reduce_mean(a)
ref_min_y = ref_min_x**2

# The Rosenbrock function
def f(a, b, x, y):
    """Evaluate (x - a)**2 + b * (x**2 - y)**2.

    The arguments are combined with plain arithmetic operators only, so scalars
    and arrays/tensors that broadcast together are accepted alike.
    """
    distance_term = (x - a)**2
    valley_term = b * (x**2 - y)**2
    return distance_term + valley_term


# Loss function, the one we want to minimize
# Mean of Rosenbrock functions
def loss(x, y):
    """Mean over the last axis of f(a, b, x, y), with a and b read from the notebook globals."""
    values = f(a, b, x, y)
    return tf.math.reduce_mean(values, axis=-1)


# The variables to update during the minimization of the loss function.
# Their initial values are the start point of the minimization.
x = tf.Variable(0.1, dtype=tf.float64)
y = tf.Variable(2.5, dtype=tf.float64)

# Value of the loss at the start point
x0, y0 = x.numpy(), y.numpy()
print(f"loss({x0}, {y0}) = {loss(x0, y0)}")

2. Create the optimizer and set some hyper-parameters like the learning rate.

In [None]:
# SGD optimizer, with the learning rate as its only hyper-parameter set here
optimizer = tf.optimizers.SGD(learning_rate=0.001)

3. Minimize the loss by calling the function `minimize` of the optimizer. This function performs only one step of the optimizer; you need to call it multiple times to find the minimum. Input arguments are:

- a loss function that takes no arguments.
- the variables to update during minimization. The gradient of the loss function will be computed with respect to these variables.

In [None]:
def objective():
    """Loss at the current value of the variables x and y (no argument, as `minimize` expects)."""
    return loss(x, y)


nstep = 1000
for _ in range(nstep):
    # One step of the optimizer on the variables x and y
    step_count = optimizer.minimize(objective, [x, y])

In [None]:
# The number of steps is read from the optimizer itself (`optimizer.iterations`):
# the value returned by `minimize` is not a step counter in every
# TensorFlow/Keras version (it can be None).
print(f"Number of steps = {optimizer.iterations.numpy()}")

# Point reached by the optimizer, and value of the loss at this point
print(f"x = {x.numpy()}")
print(f"y = {y.numpy()}")
print(f"loss({x.numpy()}, {y.numpy()}) = {loss(x.numpy(), y.numpy())}")

# Reference minimum, for comparison
print(f"min_x = {ref_min_x}")
print(f"min_y = {ref_min_y}")

In [None]:
import numpy as np

# Optimizers to compare: display name, class and constructor arguments
optimizers = [
    {
        'name': 'Adadelta',
        'class': tf.optimizers.Adadelta,
        'args': {'learning_rate': 1e-1, 'rho': 0.95},
    },
    {
        'name': 'Adagrad',
        'class': tf.optimizers.Adagrad,
        'args': {'learning_rate': 0.5},
    },
    {
        'name': 'Adam',
        'class': tf.optimizers.Adam,
        'args': {'learning_rate': 0.1, 'beta_1': 0.9, 'beta_2': 0.999},
    },
    {
        'name': 'Adamax',
        'class': tf.optimizers.Adamax,
        'args': {'learning_rate': 0.1, 'beta_1': 0.9, 'beta_2': 0.999},
    },
    {
        'name': 'SGD',
        'class': tf.optimizers.SGD,
        'args': {'learning_rate': 0.0005},
    },
]

# The start point for the minimization
start_point = tf.constant([0.1, 2.5], dtype=tf.float64)

# Same number of stored iterates for every algorithm (easier to compare).
# NOTE: this rebinds `nstep` (and, in the loop below, `optimizer`) used by the previous cells.
nstep = 300

# Iteration points for each optimizer: points[config_id, i] is the i-th iterate (x, y)
points = np.empty((len(optimizers), nstep, 2))

for config_id, config in enumerate(optimizers):
    # Every optimizer works on its own variable, initialized with the start point
    pt = tf.Variable(start_point)
    optimizer = config['class'](**config['args'])

    # Row 0 stores the start point, row i the point obtained after i steps
    points[config_id, 0, :] = pt.numpy()
    for i in range(1, nstep):
        optimizer.minimize(lambda: loss(pt[0], pt[1]), [pt])
        points[config_id, i, :] = pt.numpy()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from matplotlib import cm

# Function to play the animation of the results
def plot_method(optimizers, points):
    """Animate the iterates of several optimizers over the contour lines of the loss.

    Parameters
    ----------
    optimizers : list of dict
        One entry per optimizer; only its 'name' key is read here (legend label).
    points : array of shape (n_optimizers, n_steps, 2)
        Successive (x, y) iterates of each optimizer.

    Returns
    -------
    matplotlib.animation.FuncAnimation
        The animation, with one frame per stored iterate.

    Notes
    -----
    The background is drawn from the notebook globals X, Y, Z and levels; the
    markers from start_point, ref_min_x and ref_min_y.
    """
    # One frame per stored iterate. The count comes from `points` itself rather than
    # from a global step count, which another cell may have changed in the meantime
    # (frames beyond the stored iterates would index out of range in `update`).
    n_frames = np.shape(points)[1]

    fig, ax = plt.subplots(figsize=(9, 6))
    ax.contour(X, Y, Z, levels=levels, cmap=cm.coolwarm, vmin=np.amin(Z), vmax=np.amax(Z), alpha=0.5)
    ax.plot(start_point[0], start_point[1], "rX")  # start point of the minimization
    ax.plot(ref_min_x, ref_min_y, "g*")  # reference minimum
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    title = ax.set_title('it = 0')

    # One line per optimizer, initialized with its first iterate;
    # the marker is drawn on a single point of the line (the last one here).
    plots = [
        ax.plot(points[config_id, 0, 0], points[config_id, 0, 1], '-D', markevery=[-1], label=config['name'])[0]
        for config_id, config in enumerate(optimizers)
    ]
    ax.legend()

    def update(it):
        """Draw every trajectory up to iteration `it` (included), with the marker on iterate `it`."""
        for config_id, line in enumerate(plots):
            line.set_data(points[config_id, :it+1, 0], points[config_id, :it+1, 1])
            line.set_markevery([it])
        title.set_text(f"it = {it}")

        return plots

    return FuncAnimation(fig, update, frames=n_frames, interval=10)

# Grid on which the loss is sampled for the contour lines
X, Y = np.meshgrid(np.arange(-2, 2, 0.1), np.arange(-1, 3, 0.1))

# A trailing axis is appended to the grid so that it broadcasts against the
# vectors a and b inside the loss; the mean is then taken over this last axis.
Z = loss(X[..., np.newaxis], Y[..., np.newaxis])
levels = [1e-5, 1e-1, 1, 10, 50, 100, 200]

anim = plot_method(optimizers, points)

## Datasets

Tensorflow provides datasets to feed your neural network with data. They are iterables, like the result of the `range` function, and many operations can be applied to them easily.

### Dataset creation

The simplest way to create a dataset is from a tensor, numpy array or python list

In [None]:
# From a numpy array: the array is sliced along its first axis, one slice per element of the dataset
npdata = np.arange(10)
dataset = tf.data.Dataset.from_tensor_slices(npdata)

In [None]:
# You can then iterate over the dataset, one element at a time
for data in dataset:
    print(data)

If your data doesn't fit in memory, you can write them in `.tfrecord` files and create a dataset from the files with the function `tf.data.TFRecordDataset`. We will not look into it here, but when dealing with a huge amount of data, this is the only solution.

In [None]:
# For training, you will need training data and the exact results (labels) that your neural network will approximate.
# This can be done by passing a tuple of the two arrays: the dataset then yields (data, label) pairs.
training_data = np.arange(10)
labels = training_data**2

dataset = tf.data.Dataset.from_tensor_slices((training_data, labels))

for data, label in dataset:
    print(data, " -- ", label)

### Transformation and preparation of a dataset for training

Many operations can be applied to a dataset before using it in the training phase. One can apply transformations on the data, cache them (store them in memory for faster access), batch them, prefetch them and shuffle them.

In [None]:
# Applying a transformation: use the map function of the dataset
# (here every element is squared)
npdata = np.arange(10)
dataset = (
    tf.data.Dataset.from_tensor_slices(npdata)
    .map(lambda value: value**2, num_parallel_calls=tf.data.AUTOTUNE)
)

for data in dataset:
    print(data)

In [None]:
# Take only the first n elements (here n = 3)
for data in dataset.take(3):
    print(data)

In [None]:
# Caching data for faster access is done with the cache function.
# The difference is visible during the training phase, when we iterate many times over the dataset.
dataset = dataset.cache()

In [None]:
# Shuffling: the elements of the dataset are shuffled.
# The only argument used here is the number of elements in the shuffle buffer.
for data in dataset.shuffle(2):
    print(data)

In [None]:
# With a buffer of 2 elements (previous cell), the dataset fills a buffer with 2 elements
# and randomly picks one of them. The picked element is then replaced with another element.
# For perfect shuffling of all the data in the dataset, a buffer size greater than or equal
# to the number of elements is mandatory: this is the case here.
for data in dataset.shuffle(10):
    print(data)

In [None]:
# One can batch samples from the dataset to take more than one at once.
# Here we want the data in batches of 2.
for data in dataset.shuffle(10).batch(2):
    print(data)

In [None]:
# During the training phase, for a faster use of the dataset, you can prefetch the data.
# It means that all the operations that come before the prefetch are performed at the same time as the training.
# Here we prefetch 2 shuffled batches of size 2 while printing the previous two batches.
for data in dataset.shuffle(10).batch(2).prefetch(2):
    print(data)