# Iceberg Classification Step 3a: Model Training with Maggy - Hyperparameter Optimization
The following code includes demonstration for:
- get data from ``feature store``
- training with ``TFRecord`` on a single GPU
- hyperparameter optimization with ``maggy``

In [9]:
import tensorflow as tf
import hops
from maggy import tensorboard
from hops import model as hopsworks_model
import hsfs
import maggy
import maggy.version

# SparkSession available as 'spark'
print(
    f"-----------------------------------------------\n" \
    f"This notebook is tested with:\n" \
    f"  - TensorFlow {tf.__version__}.\n" \
    f"  - Hopsworks {hops.__version__}.\n" \
    f"  - Maggy {maggy.version.__version__}.\n" \
    f"  - Spark {spark.version}.\n"
)

-----------------------------------------------
This notebook is tested with:
  - TensorFlow 2.4.1.
  - Hopsworks 2.2.0.1.
  - Maggy 1.0.0rc0.
  - Spark 2.4.3.2.

In [10]:
def create_model(kernel, pool, dropout, input_shape):
    """Returns a CNN model for image classification.
    
    Parameters:
    - input_shape(tuple): input shape of the CNN model.
    
    Returns:
    - a TensorFlow keras model that is not compiled yet.
    
    """
        
    model = tf.keras.models.Sequential()
    
    #Conv Layer 1
    model.add(tf.keras.layers.Conv2D(16, kernel_size=(kernel, kernel), activation='relu', input_shape=input_shape))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(pool, pool), strides=(2,2)))
    model.add(tf.keras.layers.Dropout(dropout))

    #Conv Layer 2
    model.add(tf.keras.layers.Conv2D(32, kernel_size=(kernel, kernel), activation='relu' ))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(pool, pool), strides=(2, 2)))
    model.add(tf.keras.layers.Dropout(dropout))
    
    #Conv Layer 3
    model.add(tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(tf.keras.layers.Dropout(0.2))

    #Conv Layer 4
    model.add(tf.keras.layers.Conv2D(16, kernel_size=(3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(tf.keras.layers.Dropout(0.2))

    #Flatten the data for upcoming dense layers
    model.add(tf.keras.layers.Flatten())

    #Dense Layers
    model.add(tf.keras.layers.Dense(256))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dropout(0.2))

    #Dense Layer 2
    model.add(tf.keras.layers.Dense(128))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dropout(0.2))

    #Sigmoid Layer
    model.add(tf.keras.layers.Dense(1))
    model.add(tf.keras.layers.Activation('sigmoid'))
    return model

In [11]:
from maggy.callbacks import KerasBatchEnd

In [12]:
def train_fn(kernel, pool, dropout, reporter):
    """"Wrapper function for the experiment.
    
    Parameters:
    - learning_rate: learning rate of the optimizer during training.
    
    Returns:
    - metrics: training summary.
    
    """
    
    # ---------------- Initialization ----------------
    # Establish a connection with the Hopsworks feature store
    #     engine='training' is needed so that the executors in Spark can connect to feature store
    connection = hsfs.connection(engine='training') 
    # Get the feature store handle for the project's feature store
    fs = connection.get_feature_store()
    
    # Clear session info
    tf.keras.backend.clear_session()
    
    # Set up visible GPU
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
      try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
          tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
      except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
    
    # ---------------- Initialization ----------------
    
    # ---------------- Hyperparameters ----------------
    # Number of epochs to training
    EPOCHS = 3
    # Training batch size
    TRAIN_BATCH_SIZE = 32
    # Evaluation batch size
    EVAL_BATCH_SIZE = 1
    # Shuffle buffer size for TensorFlow dataset
    SHUFFLE_BUFFER_SIZE = 10000
    # learning rate of the optimizer during training
    LEARNING_RATE = 0.001
    # input_shape of the model
    INPUT_SHAPE= (75, 75, 3)
    # Name of the training dataset in feature store
    TRAIN_FS_NAME = 'train_tfrecords_iceberg_classification_dataset'
    # Name of the test dataset in feature sotre
    TEST_FS_NAME = 'test_tfrecords_iceberg_classification_dataset'
    
    # ---------------- Hyperparameters ----------------
    
    # ---------------- Training Process ----------------
    # use this strategy to test your code before switching to other strategies which actually distributes to multiple devices/machines.
    #strategy = tf.distribute.OneDeviceStrategy(device='/gpu:0')
    #strategy = tf.distribute.MultiWorkerMirroredStrategy()
    # construct model under distribution strategy scope
    #with strategy.scope(): 
    model = create_model(kernel, pool, dropout, INPUT_SHAPE)
    model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE), loss='binary_crossentropy',  metrics=['accuracy'])

    callbacks = [KerasBatchEnd(reporter, metric='acc')]

    
    def decode(sample):
        """Decode each training sample.
        
        This funtionc decode each sample and return it in a format that is ready for training.
        
        Parameters:
        - sample: raw features of a data sample stored in a dictionary-like object
        
        Returns:
        - x: 'band_1', 'band_2', and 'band_avg' will be reshaped and stacked
             and form the input of the model
        - y: 'is_iceberg' will be the output of the model.
        """
        
        name_list = ['band_1', 'band_2', 'band_avg', 'is_iceberg']
        x = tf.stack([sample[name_list[0]], sample[name_list[1]], sample[name_list[2]]], axis=1)
        x = tf.reshape(x, [75, 75, 3])
        y = [tf.cast(sample[name_list[3]], tf.float32)]
        return x,y
    
    # Training dataset in TFRecord format
    train_ds = fs.get_training_dataset(name=TRAIN_FS_NAME).tf_data(target_name='is_iceberg')
    train_ds = train_ds.tf_record_dataset(process=False, batch_size=TRAIN_BATCH_SIZE, num_epochs=EPOCHS)
    train_ds_processed = train_ds.map(decode).shuffle(SHUFFLE_BUFFER_SIZE).repeat(EPOCHS).cache().batch(TRAIN_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
    
    # Evaluation dataset in TFRecord format
    eval_ds = fs.get_training_dataset(name=TEST_FS_NAME).tf_data(target_name='is_iceberg')
    eval_ds = eval_ds.tf_record_dataset(process=False, batch_size=EVAL_BATCH_SIZE, num_epochs=EPOCHS)
    eval_ds_processed = eval_ds.map(decode).shuffle(SHUFFLE_BUFFER_SIZE).repeat(EPOCHS).cache().batch(EVAL_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
    
    # Start training the model.
    history = model.fit(
        train_ds_processed,
        epochs=EPOCHS,
        verbose=1,
        validation_data=eval_ds_processed,
        callbacks=callbacks
    )
    
    # 'metrics' is the return value of this function;
    #     The values in 'metrics' will be printed to the notebook cell that launch the experiment
    metrics = {
        'train_loss': history.history['loss'][-1],
        'train_accuracy': history.history['accuracy'][-1],
        'val_loss': history.history['val_loss'][-1],
        'val_accuracy': history.history['val_accuracy'][-1],
    } 

    # ---------------- Training Process ----------------
    
#     # ---------------- Save and Export ----------------
#     # Export model as savedModel
# #     export_path = tensorboard.logdir() + '/SavedModel'

#     tf.keras.models.save_model(
#         model,
#         export_path,
#         overwrite=True,
#         include_optimizer=True,
#         save_format=None,
#         signatures=None,
#         options=None
#     )
    
#     # 'hopsworks_model' is the moudle provided by hopsworks for exporting models
#     # 'hopsworks_model' is a different name of 'hops.model' to avoid name clashes
#     hopsworks_model.export(export_path, 'ship_iceberg_classifier', metrics=metrics)
#     # ---------------- Save and Export ----------------
    
    return metrics['val_loss']

## Define the search space for hyperparameter optimization

In [13]:
from maggy import Searchspace

# The searchspace can be instantiated with parameters
sp = Searchspace()

# Or additional parameters can be added one by one
sp.add('kernel', ('INTEGER', [3, 4]))
sp.add('pool', ('INTEGER', [2, 3]))
sp.add('dropout', ('DOUBLE', [0.10, 0.50]))

Hyperparameter added: kernel
Hyperparameter added: pool
Hyperparameter added: dropout

## Launch the hyperparameter optimization

In [14]:
from maggy import experiment
from maggy.experiment_config import OptimizationConfig

In [15]:
config = OptimizationConfig(
                            num_trials=2,
                            optimizer='randomsearch',
                            searchspace=sp,
                            direction='max',
                            es_interval=1,
                            es_min=2,
                            hb_interval=5,
                            name='Iceberg_Classification_Maggy'
                        )

In [16]:
result = experiment.lagom(train_fn=train_fn, config=config)

HBox(children=(FloatProgress(value=0.0, description='Maggy experiment', max=2.0, style=ProgressStyle(descripti…

0: Connected. Call `.close()` to terminate connection gracefully.
0: 
0: 
0: Epoch 1/3
0: 
0: 
1: Connected. Call `.close()` to terminate connection gracefully.
1: 
1: 
1: Epoch 1/3
1: 
1: 
0: 
0: 
1: 
1: 
0: Epoch 2/3
1: Epoch 2/3
0: Epoch 3/3
1: Epoch 3/3

------ RandomSearch Results ------ direction(max) 
BEST combination {"kernel": 4, "pool": 3, "dropout": 0.4383837440120528} -- metric 0.4286040663719177
WORST combination {"kernel": 4, "pool": 3, "dropout": 0.2679583679616675} -- metric 0.36450064182281494
AVERAGE metric -- 0.39655235409736633
EARLY STOPPED Trials -- 0
Total job time 0 hours, 1 minutes, 47 seconds

Finished experiment.


# End of Step 3a