Rafał Nowak

<img style="float: left" src="http://rno.ii.uni.wroc.pl/uploads/cats-dogs/img/tplx-academy_rgb_transparent.png" width="450"><br/>
<font size="+4">Neural Networks in Tensorflow 2.x<br/>but with some math details</font><br/>

In [60]:
# `%tensorflow_version 2.x` is a notebook magic meant only for Google Colab
# (per the original note).
# NOTE(review): presumably this line has to be removed or commented out when
# running outside Colab — confirm.
%tensorflow_version 2.x

import tensorflow as tf
import numpy as np
print(tf.__version__)

2.1.0


# Tensorflow basics

## List available devices

In [2]:
# Physical CPU devices visible to TensorFlow
tf.config.list_physical_devices(device_type='CPU')

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [3]:
# Physical GPU devices visible to TensorFlow
tf.config.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Constants and variables in tensorflow

### Scalars

In [6]:
# The same scalar stored with different dtypes: without an explicit `dtype`
# the integer becomes int32 and the float becomes float32 (see the output).
(
    tf.constant(42),
    tf.constant(42.0),
    tf.constant(42.0, dtype=tf.float16),
    tf.constant(42.0, dtype=tf.float32),
    tf.constant(42.0, dtype=tf.float64),
)

(<tf.Tensor: shape=(), dtype=int32, numpy=42>,
 <tf.Tensor: shape=(), dtype=float32, numpy=42.0>,
 <tf.Tensor: shape=(), dtype=float16, numpy=42.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=42.0>,
 <tf.Tensor: shape=(), dtype=float64, numpy=42.0>)

### `tf.device`

In [7]:
# Scalar integer constant
a = tf.constant(5)
# Device string of the tensor (a CPU device in the recorded output)
a.device

'/job:localhost/replica:0/task:0/device:CPU:0'

In [9]:
# b = a.gpu()  # <--- deprecated :-(

# Instead, create the copy inside a `tf.device` scope requesting the first GPU.
with tf.device('/gpu:0'):
    b = tf.identity(a)

# Device string of the copy (GPU:0 in the recorded output)
b.device

'/job:localhost/replica:0/task:0/device:GPU:0'

In [11]:
# `a` was reported on the CPU and `b` on the GPU; the recorded output shows
# that their sum is placed on the GPU.
c = a+b
# Tensor, its NumPy value, and the device it lives on
c, c.numpy(),  c.device

(<tf.Tensor: shape=(), dtype=int32, numpy=10>,
 10,
 '/job:localhost/replica:0/task:0/device:GPU:0')

### Vectors
<img align="right" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/vector.jpeg" width="450">

In [0]:
# Rank-1 tensor (vector) with five float32 entries
x = tf.constant([2.0, 5.0, 1.0, 3.0, 4.0], dtype=tf.float32)

# Value, shape and device, one per line
print(x, x.shape, x.device, sep="\n")

tf.Tensor([2. 5. 1. 3. 4.], shape=(5,), dtype=float32)
(5,)
/job:localhost/replica:0/task:0/device:CPU:0


In [12]:
# Same values as before, now held in a tf.Variable instead of a constant
x = tf.Variable([2.0, 5.0, 1.0, 3.0, 4.0], dtype=tf.float32)

# Variable repr and its shape, one per line
print(x, x.shape, sep="\n")

<tf.Variable 'Variable:0' shape=(5,) dtype=float32, numpy=array([2., 5., 1., 3., 4.], dtype=float32)>
(5,)


### Dot product
<img align="right" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/dot_product.gif" width="450">

<img align="right" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/dot_product_example.png" width="250">

* `tf.tensordot(a, b, axes=1)`; see <a href="https://www.tensorflow.org/api_docs/python/tf/tensordot">tensordot</a>

In [19]:
a = tf.constant([ 1, 4, -2])
b = tf.constant([-2, 1,  7])

# Dot product: 1*(-2) + 4*1 + (-2)*7 = -12
tf.tensordot( a, b, axes=1 )

<tf.Tensor: shape=(), dtype=int32, numpy=-12>

### Matrices
<img align="right" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/matrix_dot_vector.png" width="650">

In [21]:
# A has 4 rows and 2 columns; B has 2 rows and 3 columns
A = tf.constant([[2.0, 3.0], [1.0, 1.0], [3.0, 0.0], [4.0, 1.0]])
B = tf.constant([[1.0, 2.0, 3.0], [1.0, 0.0, 1.0]])

# Display both matrices
A, B

(<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
 array([[2., 3.],
        [1., 1.],
        [3., 0.],
        [4., 1.]], dtype=float32)>,
 <tf.Tensor: shape=(2, 3), dtype=float32, numpy=
 array([[1., 2., 3.],
        [1., 0., 1.]], dtype=float32)>)

In [0]:
# Device string of the matrix A
A.device

'/job:localhost/replica:0/task:0/device:CPU:0'

See <a href="https://www.tensorflow.org/api_docs/python/tf/linalg/matmul">`tf.matmul`</a> documentation.

In [0]:
# Matrix product of the 4x2 matrix A and the 2x3 matrix B (result is 4x3)
tf.matmul(A, B)

<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[ 5.,  4.,  9.],
       [ 2.,  2.,  4.],
       [ 3.,  6.,  9.],
       [ 5.,  8., 13.]], dtype=float32)>

In [25]:
# tensordot with axes=1 yields the same (4, 3) shape as tf.matmul(A, B)
tf.tensordot(A, B, axes=1).shape

TensorShape([4, 3])

# Dataset representation

* each **sample** is described by $n$ **features** (vector with $n$ numbers)
* $m$ samples are represented by matrix with size $m \times n$

<img align="left" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/matrix_m_n.png" width="250">

## Example - diabetes dataset (442x10)

## Tensor representation

In [27]:
# Diabetes dataset (442 people described by 10 features:
# age, sex, BMI, avg blood pressure, s1, s2, ..., s6)
import sklearn
from sklearn import datasets as sklearn_datasets

# Feature matrix as a NumPy array (the targets are not used here)
X_numpy = sklearn_datasets.load_diabetes().data
X_numpy

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

### `tf.convert_to_tensor`

In [33]:
# Convert the NumPy feature matrix into a float32 tensor
X = tf.convert_to_tensor(X_numpy, dtype=tf.float32)

# Tensor contents followed by its shape
print(X, X.shape, sep="\n")

tf.Tensor(
[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764612]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.0852989   0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.01107952 -0.04687948
   0.01549073]
 [-0.04547248 -0.04464164  0.03906215 ...  0.02655962  0.04452837
  -0.02593034]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.03949338 -0.00421986
   0.00306441]], shape=(442, 10), dtype=float32)
(442, 10)


In [35]:
# First 3 people (rows 0, 1 and 2 of the feature tensor)
print(X[:3])

tf.Tensor(
[[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990842 -0.01764612]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632784 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06832974 -0.09220405]
 [ 0.0852989   0.05068012  0.04445121 -0.00567061 -0.04559945 -0.03419447
  -0.03235593 -0.00259226  0.00286377 -0.02593034]], shape=(3, 10), dtype=float32)


## `tf.data.Dataset`

In [36]:
# Slice X_numpy along its first axis: each dataset element is one row
# (shape (10,), float64 according to the repr in the output)
dataset = tf.data.Dataset.from_tensor_slices(X_numpy)
dataset

<TensorSliceDataset shapes: (10,), types: tf.float64>

In [37]:
# Iterate over the dataset as NumPy arrays and print the first element
iterator = dataset.as_numpy_iterator()
print(next(iterator))

[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
 -0.04340085 -0.00259226  0.01990842 -0.01764613]


In [38]:
# First row of the original NumPy array, for comparison with the dataset element
X_numpy[0]

array([ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
       -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613])

# Perceptron - single neuron unit

Simple linear model with $n+1$ parameters (weights):
* $w_1, w_2, \ldots, w_n$ - called the _kernel_ or _weights_,
* $b$ - usually called the _bias_ or _intercept_.

<img align="right" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/perceptron.png" width="500">

It performs the following matrix-vector operation:
$$ \mathtt{logits} = Xw+b = w_1 x_1 + w_2 x_2 + \cdots + w_n x_n + b. $$

### Step 1 - Weights initialization

**TODO**

Initialize the parameters of the model somehow.

Let's try to start with<br/>
$ w = [ 0.1, 0.2, 0.3, 0.1, 0.0, 0.7, 0.8, 0.9, 1.0, 0.5 ] $<br/>
$ b = 0.66 $


In [0]:
# TODO: initialize somehow parameters w, b
# Fixed starting values: one weight per feature (10 in total) and a scalar bias
w = tf.Variable([ 0.1, 0.2, 0.3, 0.1, 0.0, 0.7, 0.8, 0.9, 1.0, 0.5 ], dtype=tf.float32)
b = tf.Variable(0.66, dtype=tf.float32)

### Step 2 - Forward propagation

**TODO**:
- perform matrix-vector operation $$ \mathtt{logits} = Xw + b $$

In [53]:
# TODO
# logits = Xw + b, computed for all samples at once (one value per row of X)
logits = tf.tensordot(X, w, axes=1) + b
print(logits.shape)

(442,)


In [54]:
# Device on which the computed logits live
logits.device

'/job:localhost/replica:0/task:0/device:GPU:0'

# Toy dataset - MNIST-like handwritten digits (scikit-learn `load_digits`, 1797 x 64)
<img align="left" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/digits_8x8.png" width="450">

* 1797 images (handwritten digits 0, 1, ..., 9) represented by 8x8 pixels (64 features)

In [56]:
# scikit-learn's handwritten-digits data, loaded through the `sklearn_datasets`
# alias imported earlier in the notebook
dataset = sklearn_datasets.load_digits()
X = tf.convert_to_tensor(dataset.data, dtype=tf.float32)
target = tf.convert_to_tensor(dataset.target)

# Shapes of the feature matrix and of the label vector
print(X.shape, target.shape, sep="\n")

(1797, 64)
(1797,)


In [57]:
# Two sample images (rows 60 and 61) followed by their labels
print(X[60:62], target[60:62], sep="\n")

tf.Tensor(
[[ 0.  0. 10. 15. 14.  4.  0.  0.  0.  0.  4.  6. 13. 16.  2.  0.  0.  0.
   0.  3. 16.  9.  0.  0.  0.  0.  0.  1. 16.  6.  0.  0.  0.  0.  0.  0.
  10. 12.  0.  0.  0.  0.  0.  0.  1. 16.  4.  0.  0.  1.  9.  5.  6. 16.
   7.  0.  0.  0. 14. 12. 15. 11.  2.  0.]
 [ 0.  0.  6. 13. 16.  6.  0.  0.  0.  3. 16. 14. 15. 16.  1.  0.  0.  0.
   5.  0.  8. 16.  2.  0.  0.  0.  0.  0.  8. 16.  3.  0.  0.  3. 15. 16.
  16. 16.  9.  0.  0.  5. 13. 14. 16. 11.  3.  0.  0.  0.  0. 12. 15.  1.
   0.  0.  0.  0.  4. 16.  7.  0.  0.  0.]], shape=(2, 64), dtype=float32)
tf.Tensor([3 7], shape=(2,), dtype=int64)


## Weights initialization

**TODO**

Initialize **randomly** the model parameters.

In [0]:
# Fix the global seed so the random draws below are repeatable
tf.random.set_seed(1234)

# TODO: initialize random (with normal distribution) parameters w, b
# One weight per pixel (64) and a scalar bias
w = tf.Variable(tf.random.normal(shape=(64,)))
b = tf.Variable(tf.random.normal(shape=()))

In [65]:
# First five randomly initialized weights
w[:5]

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([ 0.8369314 , -0.73429775,  1.0402943 ,  0.04035992, -0.72186583],
      dtype=float32)>

## Forward propagation

**TODO**:
- perform matrix-vector operation $$ \mathtt{logits} = Xw + b $$

In [66]:
# TODO
# logits = Xw + b for every image: one value per row of X
logits = tf.tensordot(X, w, axes=1) + b

# Shape of the result and its first three entries
print(logits.shape, logits[:3], sep="\n")

(1797,)
tf.Tensor([ -4.024562 -18.443037 -20.574102], shape=(3,), dtype=float32)


## Multi-class case
<img style="float: left" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/nn.png" width="250">

In the more general case:
* dataset $X \in \mathbb{R}^{m\times n}$ --- $m$ samples, $n$ features,
* $K$ classes,
we have parameters $W\in\mathbb{R}^{n\times K}$ and $b\in\mathbb{R}^{K}$.

**Forward propagation**
$$ \mathtt{logits} = XW+b \in \mathbb{R}^{m\times K} $$

In our toy dataset we have 10 classes (10 digits).
We need 10 perceptrons.

In [67]:
tf.random.set_seed(1234)

# TODO: initialize parameters for 10 perceptrons
# One column of W and one entry of b per class
W = tf.Variable(tf.random.normal(shape=(64, 10)))
b = tf.Variable(tf.random.normal(shape=(10,)))

# Forward propagation should give 10 numbers for each sample from dataset
logits = tf.matmul(X, W) + b
print(logits.shape)

(1797, 10)


In [68]:
# Logits of the first three samples (10 values each)
logits[:3]

<tf.Tensor: shape=(3, 10), dtype=float32, numpy=
array([[  28.0057   ,  -10.317568 ,    4.1859493,   41.452213 ,
          87.12719  ,   12.526066 ,  -32.508133 ,   14.999803 ,
          50.011566 ,  -46.345615 ],
       [  65.52083  ,   68.911575 ,    3.0044184,   27.64514  ,
         120.454605 ,  -18.31042  ,   41.88183  ,  -60.928543 ,
         -53.047035 ,  -75.92069  ],
       [ 138.8532   ,   21.11906  ,    9.45552  ,   27.635242 ,
         126.546005 ,   64.14821  ,   51.996555 ,  -12.506589 ,
          80.02577  , -110.63597  ]], dtype=float32)>

# Loss function

## Softmax function
<img style="float: left" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/softmax.png" width="850">

Let's consider the function `softmax`, which for a given vector $\mathtt{logits} = z\in\mathbb R^K$ returns the vector $\hat y\in \mathbb R^K$ such that

$$ \hat y_j = \frac{\text{exp}(z_j)}{\sum_{k=1}^{K} \text{exp}(z_k)}. $$

```python
def softmax(z):
    """Row-wise softmax of a 2-D array of logits.

    Args:
        z: array of shape (m, K); each row holds the logits of one sample.

    Returns:
        A new array of the same shape whose rows are non-negative and sum to 1.
        The input array is left unmodified.
    """
    # Subtracting the row maximum does not change the result mathematically,
    # but keeps exp() from overflowing. Use `z - ...` rather than `z -= ...`
    # so the caller's array is not mutated.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)
```

**Exercise**: compute softmax function for logits<br/>
Hint: see <a href="https://www.tensorflow.org/api_docs/python/tf/nn/softmax">`tf.nn.softmax`</a>

In [72]:
# Compute the softmax(logits): the logits XW + b are recomputed here and
# normalized in one expression
y_hat = tf.nn.softmax(tf.matmul(X, W) + b)
print('y_hat.shape = ', y_hat.shape)

y_hat.shape =  (1797, 10)


In [73]:
# Print the class probabilities of samples 45..50 with four decimals
for i in range(45, 51):
    probabilities = ", ".join("{:.4f}".format(v) for v in y_hat[i])
    print("y_hat[{:d}]=[".format(i), probabilities, "]")

y_hat[45]=[ 0.0000, 0.0000, 0.0000, 0.0000, 0.1314, 0.0000, 0.0000, 0.0000, 0.8686, 0.0000 ]
y_hat[46]=[ 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000 ]
y_hat[47]=[ 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000 ]
y_hat[48]=[ 0.9991, 0.0000, 0.0000, 0.0000, 0.0009, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000 ]
y_hat[49]=[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000 ]
y_hat[50]=[ 0.8167, 0.0000, 0.0000, 0.0000, 0.1833, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000 ]


## Cross-entropy loss functions

Assume that:

Example $x^{(i)}$ (of target class $k_i$) -- (forward propagation) --> $\hat y^{(i)}$

We apply the following loss for such example
$$
\mathrm{loss} = - \log \hat y^{(i)}_{k_i}
$$

For the whole training dataset we calculate the mean loss
$$
\begin{aligned}
J(W,b) = - \frac{1}{m} \sum_{i=1}^{m} \log\hat{y}_{\mathtt{target}[i]}^{(i)} \in \mathbb{R}
\end{aligned}
$$

**Exercise**: compute cross-entropy loss for whole training dataset<br/>
Hint: use <a href="https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits">`tf.nn.softmax_cross_entropy_with_logits`</a> instead of <a href="https://www.tensorflow.org/api_docs/python/tf/nn/softmax">`tf.nn.softmax`</a>; see also:
* <a href="https://www.tensorflow.org/api_docs/python/tf/one_hot">`tf.one_hot`</a>
* <a href="https://www.tensorflow.org/api_docs/python/tf/math/reduce_mean">`tf.math.reduce_mean`</a>

In [0]:
# TODO: one_hot
# Encode every integer label as a length-10 indicator vector
oh_target = tf.one_hot(target, depth=10)

In [0]:
# softmax cross-entropy, one value per sample, computed directly from the logits
sce = tf.nn.softmax_cross_entropy_with_logits(labels=oh_target, logits=logits)

In [82]:
# reduce_mean: average the per-sample losses into a single scalar
rm = tf.reduce_mean(sce)
rm

<tf.Tensor: shape=(), dtype=float32, numpy=84.14354>

**Exercise**: implement `loss` function

In [88]:
# Now it is all about good parameters :-)

# TODO: try to find the seed such that random parameters give better loss, 
# i.e. less than 62

def loss(X, target, W, b):
    """Mean softmax cross-entropy of the linear model ``X @ W + b``.

    Args:
        X: feature matrix; it is multiplied by ``W`` with ``tf.matmul``.
        target: integer class labels; they are one-hot encoded internally.
        W: weight matrix; its last dimension is used as the number of classes.
        b: bias added to ``X @ W``.

    Returns:
        Scalar tensor with the cross-entropy averaged over all samples.
    """
    logits = tf.matmul(X, W) + b

    # Take the number of classes from W instead of hard-coding 10, so the
    # one-hot labels always match the width of the logits.
    oh_target = tf.one_hot(target, depth=W.shape[-1])
    # Keyword arguments rule out an accidental labels/logits swap.
    sce = tf.nn.softmax_cross_entropy_with_logits(labels=oh_target, logits=logits)
    return tf.reduce_mean(sce)

tf.random.set_seed(99)  # TODO <-- put your seed here

# TODO: initialize parameters for 10 perceptrons
W = tf.Variable(tf.random.normal(shape=(64, 10)))
b = tf.Variable(tf.random.normal(shape=(10,)))

# Loss obtained with these randomly drawn parameters
loss(X, target, W, b)

<tf.Tensor: shape=(), dtype=float32, numpy=57.797626>

In [86]:
# Shapes of the weight matrix and the bias vector
W.shape, b.shape

(TensorShape([64, 10]), TensorShape([10]))

# Optimization with gradient descent
<img align="left" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/gradient_descent.png" width="450">
<img align="left" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/gradient_descent_formula.jpg" width="450">

We need to compute the gradients
$$ \frac{\partial}{\partial W}J(W,b), \qquad  \frac{\partial}{\partial b}J(W,b) $$

In [0]:
# Compute the loss inside the tape context so it can be differentiated afterwards
with tf.GradientTape() as gradient_tape:
    L = loss(X, target, W, b)
# Gradients of the loss with respect to W and b
dW, db = gradient_tape.gradient(L, [W, b])

In [90]:
# The gradient dW has the same shape as W
W.shape, dW.shape

(TensorShape([64, 10]), TensorShape([64, 10]))

In [95]:
# Shapes of both gradients
dW.shape, db.shape

(TensorShape([64, 10]), TensorShape([10]))

# Backward propagation

One gets the gradient $\frac{\partial}{\partial W}J$ by using the chain rule
$$ \frac{\partial J}{\partial W} = \frac{\partial \texttt{logits}}{\partial W} \frac{\partial J}{\partial \texttt{logits}}.  $$

First term is simple since `logits` is linear function of $W$:
$$ \frac{\partial \texttt{logits}}{\partial W} = X^T \in \mathbb{R}^{n\times m} $$

The second term can be found after some calculus. Finally, one can check that
$$\frac{\partial J}{\partial\texttt{logits}} = (\hat y - \mathtt{target\_{}one\_{}hot}
 ) / m =: \delta, $$
where `target_one_hot` is one-hot encoding of `target`.

Hence we get:
$$ \frac{\partial J}{\partial W} = \frac{\partial \texttt{logits}}{\partial W} \frac{\partial J}{\partial \texttt{logits}} = X^T \delta  $$

The gradient $\frac{\partial J}{\partial b}$ can be computed in the same way,
but one should use $\mathbb{1}^T$ (vector with ones) instead of $X^T$.

```python
# Step 1. Compute delta
# delta = (y_hat - one-hot targets) / m, with m = len(y_hat)
# NOTE(review): `to_categorical` is presumably a one-hot encoder (e.g. from Keras) — confirm.
delta = ( y_hat - to_categorical(target))/len(y_hat)
# TODO: Compute ∂J / ∂W
# X^T times delta
dW = X.T.dot(delta)
print( 'dW.shape = ', dW.shape )
# TODO: Compute ∂J / ∂b
# Summing delta over the samples is the same as multiplying by a vector of ones
db = np.sum(delta, axis=0)
print( 'db.shape = ', db.shape )
```

# MLP
<img align="left" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/mlp.png" width="750">

In [0]:
# TODO: Initialize parameters
tf.random.set_seed(12345)

# first hidden layer (100 neurons)
W1 = tf.Variable(tf.random.normal(shape=(64, 100)))
b1 = tf.Variable(tf.random.normal(shape=(100,)))

# second hidden layer (50 neurons)
W2 = tf.Variable(tf.random.normal(shape=(100, 50)))
b2 = tf.Variable(tf.random.normal(shape=(50,)))

# output layer (10 neurons)
W3 = tf.Variable(tf.random.normal(shape=(50, 10)))
b3 = tf.Variable(tf.random.normal(shape=(10,)))

## Forward propagation

In [0]:
# Input layer: the activations are the data itself
A0 = X

# First hidden layer: affine map followed by tanh
Z1 = tf.matmul(A0, W1) + b1
A1 = tf.tanh(Z1)

# Second hidden layer: affine map followed by tanh
Z2 = tf.matmul(A1, W2) + b2
A2 = tf.tanh(Z2)

# Output layer: logits turned into class probabilities by softmax
Z3 = tf.matmul(A2, W3) + b3
A3 = tf.nn.softmax(Z3)

# Network output
y_hat = A3

In [101]:
# Report the shape of the activations of every layer
for label, activation in (
    ('A0.shape (input ): ', A0),
    ('A1.shape (hidden): ', A1),
    ('A2.shape (hidden): ', A2),
    ('A3.shape (output): ', A3),
):
    print(label, activation.shape)

A0.shape (input ):  (1797, 64)
A1.shape (hidden):  (1797, 100)
A2.shape (hidden):  (1797, 50)
A3.shape (output):  (1797, 10)


In [0]:
# TODO
def loss(X,target, W1,b1,W2,b2,W3,b3):
    """Mean softmax cross-entropy of a three-layer perceptron.

    The network applies two tanh hidden layers followed by a linear output
    layer that produces the logits.

    NOTE: this definition replaces the four-argument ``loss(X, target, W, b)``
    defined earlier in the notebook.

    Args:
        X: input feature matrix.
        target: integer class labels; they are one-hot encoded internally.
        W1, b1: weights and bias of the first hidden layer.
        W2, b2: weights and bias of the second hidden layer.
        W3, b3: weights and bias of the output layer; the last dimension of
            ``W3`` is used as the number of classes.

    Returns:
        Scalar tensor with the cross-entropy averaged over all samples.
    """
    # Hidden layers: affine map followed by tanh
    A1 = tf.tanh(tf.matmul(X, W1) + b1)
    A2 = tf.tanh(tf.matmul(A1, W2) + b2)

    # Output layer produces raw logits. No explicit softmax is computed here:
    # the cross-entropy op below takes logits, and the former softmax result
    # was never used.
    Z3 = tf.matmul(A2, W3) + b3

    # Number of classes comes from W3 rather than a hard-coded 10
    oh_target = tf.one_hot(target, depth=W3.shape[-1])
    sce = tf.nn.softmax_cross_entropy_with_logits(labels=oh_target, logits=Z3)

    return tf.reduce_mean(sce)

## Loss

In [103]:
# Compute the MLP loss inside the tape context so it can be differentiated afterwards
with tf.GradientTape() as gradient_tape:
    L = loss(X, target, W1, b1, W2, b2, W3, b3)
# Display the current loss value
L

<tf.Tensor: shape=(), dtype=float32, numpy=10.479221>

## Backward propagation

In [0]:
# Gradients of the loss with respect to every parameter, in the order of the list
dW1, db1, dW2, db2, dW3, db3 = gradient_tape.gradient(L, [W1,b1,W2,b2,W3,b3])

In [105]:
# Shape of the gradient for W1
dW1.shape

TensorShape([64, 100])

# Single step of GD

In [106]:
# Step size that scales every gradient in the update
alpha = tf.constant(0.1, dtype=float)
# Gradient-descent update: subtract alpha * gradient from each parameter in place
W1.assign_sub( alpha * dW1 )
W2.assign_sub( alpha * dW2 )
W3.assign_sub( alpha * dW3 )

b1.assign_sub( alpha * db1 )
b2.assign_sub( alpha * db2 )
# The value returned by this last call is what the cell displays
b3.assign_sub( alpha * db3 )

<tf.Variable 'UnreadVariable' shape=(10,) dtype=float32, numpy=
array([ 0.8738941 , -0.66149586, -0.3649803 ,  0.01973989, -1.130679  ,
       -1.4375238 , -0.37911487,  0.40206444,  1.2475984 , -0.7922754 ],
      dtype=float32)>

In [0]:
# Loss evaluated with the updated parameters
loss(X, target, W1, b1, W2, b2, W3, b3)

<tf.Tensor: shape=(), dtype=float32, numpy=9.290512>

Next step

In [107]:
# Forward pass under the tape, then gradients for all six parameters
with tf.GradientTape() as gradient_tape:
    L = loss(X, target, W1, b1, W2, b2, W3, b3)
dW1, db1, dW2, db2, dW3, db3 = gradient_tape.gradient(L, [W1, b1, W2, b2, W3, b3])

# Gradient-descent update applied to each (parameter, gradient) pair
for variable, gradient in (
    (W1, dW1), (W2, dW2), (W3, dW3),
    (b1, db1), (b2, db2), (b3, db3),
):
    variable.assign_sub(alpha * gradient)

# Loss after the update
loss(X, target, W1, b1, W2, b2, W3, b3)

<tf.Tensor: shape=(), dtype=float32, numpy=8.389771>

# Final training loop

In [108]:
# Number of full-batch gradient-descent steps
epochs = 10

for i in range(epochs):
    # Forward pass under the tape, then gradients for all six parameters
    with tf.GradientTape() as gradient_tape:
        L = loss(X, target, W1, b1, W2, b2, W3, b3)
    dW1, db1, dW2, db2, dW3, db3 = gradient_tape.gradient(L, [W1, b1, W2, b2, W3, b3])

    # Gradient-descent update applied to each (parameter, gradient) pair
    for variable, gradient in (
        (W1, dW1), (W2, dW2), (W3, dW3),
        (b1, db1), (b2, db2), (b3, db3),
    ):
        variable.assign_sub(alpha * gradient)

    # L is the loss computed before this step's update
    print("Step {:3d}/{:3d} : loss = {:.5f}".format(i+1, epochs, L))

Step   1/ 10 : loss = 8.38977
Step   2/ 10 : loss = 7.66874
Step   3/ 10 : loss = 6.98112
Step   4/ 10 : loss = 6.54724
Step   5/ 10 : loss = 6.16826
Step   6/ 10 : loss = 5.88399
Step   7/ 10 : loss = 5.53995
Step   8/ 10 : loss = 5.27516
Step   9/ 10 : loss = 5.05868
Step  10/ 10 : loss = 4.80365


# Homework

Train your MLP to classify the images of the Fashion MNIST dataset.

_Remark_: in the case of RGB images one can flatten all the channels
<img align="left" src="http://rno.ii.uni.wroc.pl/uploads/tooploox/image_to_vector.png" width="750">

In [0]:
# Load Fashion MNIST as (features, labels) pairs for the train and test splits;
# the recorded output shows the files being downloaded
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [0]:
# Shape of the training images: (60000, 28, 28) in the recorded output
X_train.shape

(60000, 28, 28)

In [0]:
# Flatten every 28x28 image into a single row of 784 values
X_train = X_train.reshape(-1, 28 * 28)
X_train.shape

(60000, 784)