```
Rafał Nowak

Warsztat Deep Learning

9 kwietnia 2020
```

In [1]:
import urllib3
urllib3.disable_warnings()

import tensorflow as tf
import tensorflow_datasets as tfds

import numpy as np
print(tf.__version__)

2.1.0


In [2]:
# Turn on memory growth for every GPU TensorFlow can see.
# Iterating over the list (instead of indexing gpus[0]) keeps this cell
# runnable on CPU-only machines, where the list of devices is empty.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
!nvidia-smi

Thu Apr  9 19:19:43 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 435.21       Driver Version: 435.21       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 105...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   56C    P0    N/A /  N/A |    522MiB /  4042MiB |     14%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

# Dataset

## CIFAR10 From `tensorflow_datasets`

In [4]:
###########
# Option 1
# dataset = tfds.load('cifar10', as_supervised=True)
# ds_train, ds_test = dataset['train'], dataset['test']
###########


###########
# Option 2
# Look the builder up by its registered name through the public
# `tfds.builder` API instead of instantiating `tfds.image.cifar.Cifar10`
# directly.
# NOTE(review): the `tfds.image.cifar` module path is an implementation
# detail that has moved between tensorflow_datasets releases.
dataset = tfds.builder('cifar10')

# Download dataset
dataset.download_and_prepare()

# as_supervised=True makes every element an (image, label) pair
ds = dataset.as_dataset(as_supervised=True)
ds_train, ds_test = ds['train'], ds['test']

## Preprocess data

In [5]:
def preprocess_fn(image, label):
    """Convert an image to float32 and pass its label through unchanged.

    `tf.image.convert_image_dtype` rescales integer images into the
    [0, 1] range when the target dtype is floating point.
    """
    image_float = tf.image.convert_image_dtype(image, dtype=tf.float32)
    return image_float, label


# Apply the same conversion to both splits.
ds_train = ds_train.map(preprocess_fn)
ds_test = ds_test.map(preprocess_fn)

# MLP models

## model1 = MLP from tf.keras.models.Sequential with softmax activation

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense

# MLP with a softmax head: flatten each 32x32x3 input, apply one hidden
# ReLU layer of 128 units, then produce 10 softmax outputs.
model1 = Sequential()
model1.add(Flatten(input_shape=(32, 32, 3)))
model1.add(Dense(128, activation='relu'))
model1.add(Dense(10, activation='softmax'))

### Few experiments

Let's take one batch from the training data

In [7]:
# Pull a single batch of 16 examples out of the training dataset.
first_batch = ds_train.batch(16).take(1)
x_batch, y_batch = next(iter(first_batch))

print(f"The shape of x_batch is {x_batch.shape}")
print(f"The shape of y_batch is {y_batch.shape}")

The shape of x_batch is (16, 32, 32, 3)
The shape of y_batch is (16,)


We have two tensors `x_batch` and `y_batch`

In [8]:
type(x_batch), type(y_batch)

(tensorflow.python.framework.ops.EagerTensor,
 tensorflow.python.framework.ops.EagerTensor)

In [9]:
y_batch.numpy()

array([7, 8, 4, 4, 6, 5, 2, 9, 6, 6, 9, 9, 3, 0, 8, 7])

Let's feed `x_batch` forward through the model

In [10]:
y_pred = model1(x_batch) # Tensor
y_pred.shape

TensorShape([16, 10])

Since we used `softmax` at the end, our predictions are actually *probabilities* (not logits).

In [11]:
# Query model1 through both prediction entry points.
y_pred1 = model1.predict(x_batch)
y_pred2 = model1.predict_proba(x_batch)

# Both calls are expected to give element-wise identical arrays.
assert (y_pred1 == y_pred2).all()

One can also ask for the predicted classes

In [12]:
model1.predict_classes(x_batch)

array([9, 7, 7, 7, 9, 9, 7, 7, 9, 7, 9, 9, 7, 7, 9, 9])

One can try to compare this with labels from ds_train

In [13]:
# Re-read the first batch of 16 examples, keeping only the label component.
label_batches = ds_train.batch(16).take(1).map(lambda _image, batch_labels: batch_labels)
labels = next(iter(label_batches))
labels

<tf.Tensor: shape=(16,), dtype=int64, numpy=array([7, 8, 4, 4, 6, 5, 2, 9, 6, 6, 9, 9, 3, 0, 8, 7])>

In [14]:
# Option 1 - using numpy
np.mean( labels.numpy() == model1.predict_classes(x_batch) )

0.125

In [15]:
# Option 2 - using only tensorflow

# Info: this is probably too complicated :-)
# Step by step: pick the highest-scoring class per example, compare it with
# the labels, then average the resulting 0/1 values.
predicted_classes = tf.argmax(model1(x_batch), axis=1)
is_correct = tf.equal(predicted_classes, labels)
tf.reduce_mean(tf.cast(is_correct, dtype=tf.float32))

<tf.Tensor: shape=(), dtype=float32, numpy=0.125>

## model2 = model1 without softmax

In [16]:
# Same architecture as model1, but the last layer has no activation,
# so the 10 outputs are raw scores rather than softmax probabilities.
model2 = Sequential()
model2.add(Flatten(input_shape=(32, 32, 3)))
model2.add(Dense(128, activation='relu'))
model2.add(Dense(10))  # no more softmax activation

### Few more experiments

In [17]:
y_pred = model2(x_batch) # Tensor
y_pred.shape

TensorShape([16, 10])

Since we didn't use `softmax` at the end our predictions are actually **logits**.

In [18]:
y_pred1 = model2.predict(x_batch)

In [19]:
# Warning: one should not use predict_proba now
y_pred2 = model2.predict_proba(x_batch)





In [20]:
# but the assertion passes
assert np.all( np.equal( y_pred1, y_pred2 ) )

One can also ask for the predicted classes

In [21]:
model2.predict_classes(x_batch)

array([6, 4, 6, 6, 4, 6, 5, 6, 6, 6, 6, 6, 4, 5, 6, 6])

In [22]:
np.mean( labels.numpy() == model2.predict_classes(x_batch) )

0.125

# Compile the models

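It is important to use `from_logits=False` with `model1` (its last layer applies softmax, so it outputs probabilities) and `from_logits=True` with `model2` (its last layer has no activation, so it outputs logits)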

In [23]:
# model1 ends in softmax, so the loss is told it receives probabilities.
probability_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model1.compile(optimizer='adam', loss=probability_loss, metrics=['accuracy'])

In [24]:
# model2 has no final activation, so the loss is told it receives logits.
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model2.compile(optimizer='adam', loss=logits_loss, metrics=['accuracy'])

# Training

## Train models for 5 epochs

Let's use `batch_size=16`.

One can observe that `model2` is usually slightly (but not significantly) better

In [25]:
# Five passes over the training set in batches of 16.
train_batches = ds_train.batch(16)
model1.fit(train_batches, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc114161f10>

In [26]:
# Five passes over the training set in batches of 16.
train_batches = ds_train.batch(16)
model2.fit(train_batches, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc10c31ff10>

## Train again and monitor the validation loss (and accuracy)

In [27]:
# Train for five more epochs, evaluating on the test split after each epoch.
train_batches = ds_train.batch(16)
validation_batches = ds_test.batch(64)
model1.fit(train_batches, epochs=5, validation_data=validation_batches)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc1046d0f50>

In [28]:
# Train for five more epochs, evaluating on the test split after each epoch.
train_batches = ds_train.batch(16)
validation_batches = ds_test.batch(64)
model2.fit(train_batches, epochs=5, validation_data=validation_batches)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc1046dba10>

One can see above the logged values of `val_loss` and `val_accuracy`

## Analyse the confusion_matrix

In [29]:
# Split the (image, label) pairs into two parallel datasets.
train_images = ds_train.map(lambda image, _label: image)
train_labels = ds_train.map(lambda _image, label: label)

In [30]:
# Predicted class = index of the largest output. For a 10-way head this is
# what `predict_classes` computes, without relying on that Sequential-only
# helper.
predictions = np.argmax(model1.predict(train_images.batch(32)), axis=-1)

# Gather the labels batch by batch instead of materialising one scalar tensor
# per example with `list(train_labels)`.
# NOTE(review): like the original, this assumes ds_train yields examples in
# the same order on every pass, so labels and predictions line up.
true_labels = np.concatenate([batch.numpy() for batch in train_labels.batch(1024)])

# num_classes pins the result to 10x10 even if some class never occurs.
tf.math.confusion_matrix(true_labels, predictions, num_classes=10)

<tf.Tensor: shape=(10, 10), dtype=int32, numpy=
array([[1841,  189,  206,   36,  187,  111,   88,  365, 1569,  408],
       [ 214, 2084,   78,   25,  116,   82,  125,  239,  578, 1459],
       [ 414,  126,  717,   80, 1428,  391,  591,  658,  425,  170],
       [ 125,  191,  295,  341,  601, 1214,  696,  605,  445,  487],
       [ 221,  123,  352,   36, 2184,  188,  561,  843,  313,  179],
       [  82,  137,  399,  218,  641, 1703,  446,  718,  396,  260],
       [  35,  102,  170,  152, 1100,  416, 2231,  318,  172,  304],
       [ 163,  182,  176,   38,  656,  282,  111, 2775,  245,  372],
       [ 253,  207,   60,   14,   85,  140,   42,   78, 3600,  521],
       [ 212,  572,   70,    9,   77,   92,  117,  269,  674, 2908]],
      dtype=int32)>

In [31]:
# The class with the largest logit is the predicted class, so argmax works
# directly on model2's raw outputs.
predictions = np.argmax(model2.predict(train_images.batch(32)), axis=-1)

# Collect the true labels in batches.
# NOTE(review): assumes ds_train yields examples in the same order on every
# pass, so labels and predictions line up.
true_labels = np.concatenate([batch.numpy() for batch in train_labels.batch(1024)])

# Display the confusion matrix for model2 (the predictions were previously
# computed but never used).
tf.math.confusion_matrix(true_labels, predictions, num_classes=10)


## Remember to use callbacks

In [32]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ReduceLROnPlateau, ModelCheckpoint

In [33]:
# Let's start from scratch

def build_mlp(softmax_output):
    """Build and compile a one-hidden-layer MLP for 32x32x3 images.

    The output activation and the loss configuration are derived from the
    same flag, so they cannot get out of sync.

    Args:
        softmax_output: if True, the last layer applies softmax and the loss
            is created with from_logits=False; if False, the last layer has
            no activation and the loss is created with from_logits=True.

    Returns:
        A compiled `Sequential` model with 10 outputs.
    """
    model = Sequential([
        Flatten(input_shape=(32, 32, 3)),
        Dense(128, activation='relu'),
        Dense(10, activation='softmax' if softmax_output else None),
    ])
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=not softmax_output),
        metrics=['accuracy'],
    )
    return model


model1 = build_mlp(softmax_output=True)   # outputs probabilities
model2 = build_mlp(softmax_output=False)  # outputs logits (no softmax)

In [34]:
# Stop training once the validation loss has not improved for 5 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# Write TensorBoard logs, one directory per model.
tensorboard1 = TensorBoard(log_dir='logs/mlp-model1')
tensorboard2 = TensorBoard(log_dir='logs/mlp-model2')

# Lower the learning rate when the validation loss stops improving.
# reduce_lr.patience should be smaller than early_stopping.patience.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3, verbose=1)

# Keep only the best model seen so far on disk, one file per model.
model1_checkpoint = ModelCheckpoint(filepath='model1.h5', save_best_only=True)
model2_checkpoint = ModelCheckpoint(filepath='model2.h5', save_best_only=True)

In [35]:
# Train model1 for at most 100 epochs; the callbacks decide when to stop,
# when to lower the learning rate and which weights to keep on disk.
model1_callbacks = [early_stopping, tensorboard1, reduce_lr, model1_checkpoint]
model1.fit(
    ds_train.batch(16),
    validation_data=ds_test.batch(64),
    epochs=100,
    callbacks=model1_callbacks,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 00053: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 00058: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 59/100

<tensorflow.python.keras.callbacks.History at 0x7fc1046926d0>

In [36]:
# Train model2 for at most 100 epochs; the callbacks decide when to stop,
# when to lower the learning rate and which weights to keep on disk.
model2_callbacks = [early_stopping, tensorboard2, reduce_lr, model2_checkpoint]
model2.fit(
    ds_train.batch(16),
    validation_data=ds_test.batch(64),
    epochs=100,
    callbacks=model2_callbacks,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 00036: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 00043: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 00047: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 00050: ReduceLROnPlateau redu

<tensorflow.python.keras.callbacks.History at 0x7fc1045b4f90>

## Load the best models

In [37]:
from tensorflow.keras.models import load_model

# Restore both models from the files written by the ModelCheckpoint callbacks.
model1, model2 = (load_model(path) for path in ('model1.h5', 'model2.h5'))

... and evaluate them

In [38]:
# [loss, accuracy] of model1 on the train split, then on the test split.
model1_train_metrics = model1.evaluate(ds_train.batch(64))
model1_test_metrics = model1.evaluate(ds_test.batch(64))
model1_train_metrics, model1_test_metrics

    157/Unknown - 1s 4ms/step - loss: 1.5944 - accuracy: 0.4380

([1.5282324253750579, 0.45964], [1.594430197576049, 0.438])

In [39]:
# [loss, accuracy] of model2 on the train split, then on the test split.
model2_train_metrics = model2.evaluate(ds_train.batch(64))
model2_test_metrics = model2.evaluate(ds_test.batch(64))
model2_train_metrics, model2_test_metrics

    157/Unknown - 0s 3ms/step - loss: 1.5542 - accuracy: 0.4481

([1.48145048377459, 0.4705], [1.5542375684543779, 0.4481])

# We have about 46% (44%) accuracy on the train (test) dataset for `model1` and about 47% (45%) for `model2`

# CNN model

In [43]:
from tensorflow.keras.layers import Convolution2D, ReLU, MaxPooling2D

# A deliberately small convolutional baseline: one 3x3 convolution with
# 6 filters, ReLU, max pooling, then the same dense head as the MLP.
model_cnn = Sequential()
model_cnn.add(Convolution2D(filters=6, kernel_size=(3,3), padding='same', input_shape=(32,32,3)))
model_cnn.add(ReLU())
model_cnn.add(MaxPooling2D())
model_cnn.add(Flatten())
model_cnn.add(Dense(128, activation='relu'))
model_cnn.add(Dense(10))  # no softmax: the outputs are logits
model_cnn.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 32, 32, 6)         168       
_________________________________________________________________
re_lu_1 (ReLU)               (None, 32, 32, 6)         0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 16, 16, 6)         0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 1536)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)               196736    
_________________________________________________________________
dense_11 (Dense)             (None, 10)                1290      
Total params: 198,194
Trainable params: 198,194
Non-trainable params: 0
________________________________________________

In [41]:
# model_cnn has no final activation, so the loss is told it receives logits.
cnn_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model_cnn.compile(optimizer='adam', loss=cnn_loss, metrics=['accuracy'])

In [42]:
# Fresh callback instances for the CNN run, configured like the MLP ones.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
tensorboard = TensorBoard(log_dir='logs/cnn-model')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint(filepath='model_cnn.h5', save_best_only=True)

# Train for at most 100 epochs, validating on the test split after each epoch.
cnn_callbacks = [early_stopping, tensorboard, reduce_lr, model_checkpoint]
model_cnn.fit(
    ds_train.batch(16),
    validation_data=ds_test.batch(64),
    epochs=100,
    callbacks=cnn_callbacks,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fc10420df10>

# We have 75% (60%) accuracy on train (test) dataset

# Conclusion

* An MLP (dense) model is not a good choice for images
* A CNN is usually a much better feature extractor
* Be careful about overfitting - one can observe that the training loss kept getting smaller quite fast, while the validation loss stopped decreasing - that's why we used **early stopping**
* Our CNN was very poor - try with more filters and more conv layers