In [0]:
import tensorflow as tf
import numpy as np

# Load the Fashion-MNIST train/test splits that ship with tf.keras.
fashion_mnist = tf.keras.datasets.fashion_mnist
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Append a trailing length-1 axis (the "empty color dimension") to both image arrays.
x_train, x_test = (images[..., np.newaxis] for images in (x_train, x_test))

In [0]:
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import *
from tensorflow.python.keras.optimizers import *

def create_model(input_shape=None, num_classes=10):
    """Build the uncompiled convolutional classifier used by the experiment cells.

    Layer stack: one 64-filter 3x3 convolution, then three stages of two
    3x3 convolutions (128, 192 and 256 filters) each followed by max pooling,
    then Flatten -> Dense(1024, relu) -> Dropout(0.5) -> Dense(num_classes,
    linear) -> Softmax.

    Args:
        input_shape: Shape of a single input sample, without the batch axis.
            When None (the default) it falls back to ``x_train.shape[1:]``
            from the notebook namespace, which is what this function always
            used before the parameter existed.
        num_classes: Number of units in the final Dense layer. Defaults to 10,
            the previously hard-coded value.

    Returns:
        The Sequential model. It is NOT compiled; callers compile it with the
        optimizer under test. ``model.summary()`` is printed as a side effect.
    """
    if input_shape is None:
        # Backward-compatible default: derive the shape from the global training data.
        input_shape = x_train.shape[1:]

    model = Sequential()
    model.add(Conv2D(filters=64, kernel_size=(3,3), padding='same', activation='relu', input_shape=input_shape))
    # Three conv-conv-pool stages with 128, 192 and 256 filters respectively.
    for i in range(2, 5):
        model.add(Conv2D(filters=64*i, kernel_size=(3,3), padding='same', activation='relu'))
        model.add(Conv2D(filters=64*i, kernel_size=(3,3), padding='same', activation='relu'))
        model.add(MaxPooling2D())
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.5))
    # Linear logits followed by an explicit Softmax layer.
    model.add(Dense(num_classes, activation='linear'))
    model.add(Softmax())

    model.summary()

    return model

# Experiment 1: `tf.keras.optimizers.Adam`

**Use TPU runtime for the next 2 experiments**

- Erratic validation loss
- Lowest validation loss reached after 17 epochs (0.3026)
- Fast epochs (3 seconds)

In [0]:
import os

from numpy.random import seed
from tensorflow import set_random_seed
from tensorflow.keras.optimizers import Adam

# Seed NumPy and TensorFlow before the model is created.
seed(1337)
set_random_seed(1337)

# Build the plain Keras model first, then wrap it for the Colab TPU.
keras_model = create_model()
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
tpu_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
tpu_strategy = tf.contrib.tpu.TPUDistributionStrategy(tpu_resolver)
model = tf.contrib.tpu.keras_to_tpu_model(keras_model, strategy=tpu_strategy)

# Optimizer under test: the Keras Adam implementation.
optimizer = Adam(lr=1e-3)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['sparse_categorical_accuracy'],
)

# Train for 25 epochs, validating on the test split after each epoch.
fit_options = dict(
    batch_size=1024,
    epochs=25,
    validation_data=(x_test, y_test),
    shuffle=True,
)
model.fit(x_train, y_train, **fit_options)

# Experiment 2: `tf.train.AdamOptimizer`

- Monotonically decreasing validation loss
- Lowest validation loss reached after 9 epochs (0.2082)
- Slower epochs (7 seconds)

In [0]:
import os

from numpy.random import seed
from tensorflow import set_random_seed

# Seed NumPy and TensorFlow before the model is created.
seed(1337)
set_random_seed(1337)

# Build the plain Keras model first, then wrap it for the Colab TPU.
keras_model = create_model()
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
tpu_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
tpu_strategy = tf.contrib.tpu.TPUDistributionStrategy(tpu_resolver)
model = tf.contrib.tpu.keras_to_tpu_model(keras_model, strategy=tpu_strategy)

# Optimizer under test: the native TensorFlow Adam implementation.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['sparse_categorical_accuracy'],
)

# Train for 25 epochs, validating on the test split after each epoch.
fit_options = dict(
    batch_size=1024,
    epochs=25,
    validation_data=(x_test, y_test),
    shuffle=True,
)
model.fit(x_train, y_train, **fit_options)

# Experiment 3: `tf.keras.optimizers.Adam` on GPU

**Switch to GPU runtime for the next 2 experiments**

On GPU, both optimizer types show similar results

- Monotonically decreasing validation loss
- Lowest validation loss reached after 10 epochs (0.2073)
- Epochs are slower than on TPU (37 seconds), since I'm using a K80 on Colab
- On a V100 I noticed quite a big speed difference as well for the two different optimizer implementations (where `tf.keras.optimizers.Adam` was faster). On a K80 it doesn't seem to matter much

In [0]:
import os  # NOTE: not referenced in this cell

from numpy.random import seed
from tensorflow import set_random_seed
from tensorflow.keras.optimizers import Adam

# Seed NumPy and TensorFlow before the model is created.
seed(1337)
set_random_seed(1337)

# No TPU wrapping here: the Keras model is compiled and trained directly.
model = create_model()

# Optimizer under test: the Keras Adam implementation.
optimizer = Adam(lr=1e-3)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['sparse_categorical_accuracy'],
)

# Train for 25 epochs, validating on the test split after each epoch.
fit_options = dict(
    batch_size=1024,
    epochs=25,
    validation_data=(x_test, y_test),
    shuffle=True,
)
model.fit(x_train, y_train, **fit_options)

# Experiment 4: `tf.train.AdamOptimizer` on GPU

- Monotonically decreasing validation loss
- Lowest validation loss reached after 8 epochs (0.2152)
- Epochs are slower than on TPU (38 seconds), since I'm using a K80 on Colab

In [0]:
import os  # NOTE: not referenced in this cell

from numpy.random import seed
from tensorflow import set_random_seed
from tensorflow.keras.optimizers import Adam  # NOTE: imported but unused; this cell uses tf.train.AdamOptimizer

# Seed NumPy and TensorFlow before the model is created.
seed(1337)
set_random_seed(1337)

# No TPU wrapping here: the Keras model is compiled and trained directly.
model = create_model()

# Optimizer under test: the native TensorFlow Adam implementation.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['sparse_categorical_accuracy'],
)

# Train for 25 epochs, validating on the test split after each epoch.
fit_options = dict(
    batch_size=1024,
    epochs=25,
    validation_data=(x_test, y_test),
    shuffle=True,
)
model.fit(x_train, y_train, **fit_options)