# Iceberg Classification Step 3c: Model Training in distributed training
The following code includes demonstration for:
- get data from ``feature store``
- training with ``TFRecord``
- distributed training

In [16]:
import os
import json
import tensorflow as tf
import uuid

import hops
from hops import model as hopsworks_model

from maggy import tensorboard
import maggy
from maggy.callbacks import KerasBatchEnd

import hsfs

# SparkSession available as 'spark'
print(
    f"-----------------------------------------------\n" \
    f"This notebook is tested with:\n" \
    f"  - TensorFlow {tf.__version__}.\n" \
    f"  - Hopsworks {hops.__version__}.\n" \
    f"  - Maggy {maggy.version.__version__}.\n" \
    f"  - Spark {spark.version}.\n"
)



An error was encountered:
module 'maggy' has no attribute 'version'
Traceback (most recent call last):
AttributeError: module 'maggy' has no attribute 'version'



In [17]:
def create_model(input_shape, kernel, pool, dropout):
    """Returns a CNN model for image classification.
    
    Parameters:
    - input_shape(tuple): input shape of the CNN model.
    
    Returns:
    - a TensorFlow keras model that is not compiled yet.
    
    """
    model = tf.keras.models.Sequential()
    
    # Conv Layer 1
    model.add(tf.keras.layers.Conv2D(16, kernel_size=(kernel, kernel), activation='relu', input_shape=input_shape))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(pool, pool), strides=(2,2)))
    model.add(tf.keras.layers.Dropout(dropout))

    # Conv Layer 2
    model.add(tf.keras.layers.Conv2D(32, kernel_size=(kernel, kernel), activation='relu' ))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(pool, pool), strides=(2, 2)))
    model.add(tf.keras.layers.Dropout(dropout))

    # Conv Layer 3
    model.add(tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(tf.keras.layers.Dropout(0.2))

    # Conv Layer 4
    model.add(tf.keras.layers.Conv2D(16, kernel_size=(3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(tf.keras.layers.Dropout(0.2))

    # Flatten the data for upcoming dense layers
    model.add(tf.keras.layers.Flatten())

    # Dense Layers
    model.add(tf.keras.layers.Dense(256))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dropout(0.2))

    # Dense Layer 2
    model.add(tf.keras.layers.Dense(128))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dropout(0.2))

    # Sigmoid Layer
    model.add(tf.keras.layers.Dense(1))
    model.add(tf.keras.layers.Activation('sigmoid'))
    
    return model

In [18]:
def train_fn(model, train_set, test_set, hparams, reporter):
    """"Wrapper function for the experiment.
    
    Returns:
    - metrics: training summary.
    
    """

    # Establish a connection with the Hopsworks feature store
    #     engine='training' is needed so that the executors in Spark can connect to feature store
    connection = hsfs.connection(engine='training') 
    # Get the feature store handle for the project's feature store
    fs = connection.get_feature_store()
    
    # Clear session info
    #tf.keras.backend.clear_session()
    
    # ---------------- Hyperparameters ----------------
    # Number of epochs to training
    EPOCHS = 3
    # Define per device batch size
    batch_size_per_replica = 32
    # Training batch size
    TRAIN_BATCH_SIZE = 32
    # Evaluation batch size
    EVAL_BATCH_SIZE = 1
    # Shuffle buffer size for TensorFlow dataset
    SHUFFLE_BUFFER_SIZE = 10000
    # Optimizer learning rate
    LEARNING_RATE = 0.001
    # input_shape of the model
    INPUT_SHAPE= (75, 75, 3)    
    # Name of the training dataset in feature store
    TRAIN_FS_NAME = 'train_tfrecords_iceberg_classification_dataset'
    # Name of the test dataset in feature sotre
    TEST_FS_NAME = 'test_tfrecords_iceberg_classification_dataset'
    # ---------------- Hyperparameters ----------------
    
    # ---------------- Training Process ----------------
    # construct model under distribution strategy scope
    
    model = create_model(INPUT_SHAPE, hparams["kernel"], hparams['pool'], hparams['dropout'])
    model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE), loss='binary_crossentropy',  metrics=['accuracy'])
    
    # Define the TensorBoard and ModelCheckpoint callbacks.
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard.logdir())
    
    
    callbacks = [tb_callback]
    
    def decode(sample):
        """Decode each training sample.
        
        This funtionc decode each sample and return it in a format that is ready for training.
        
        Parameters:
        - sample: raw features of a data sample stored in a dictionary-like object
        
        Returns:
        - x: 'band_1', 'band_2', and 'band_avg' will be reshaped and stacked
             and form the input of the model
        - y: 'is_iceberg' will be the output of the model.
        """
        
        name_list = ['band_1', 'band_2', 'band_avg', 'is_iceberg']
        x = tf.stack([sample[name_list[0]], sample[name_list[1]], sample[name_list[2]]], axis=1)
        x = tf.reshape(x, [75, 75, 3])
        y = [tf.cast(sample[name_list[3]], tf.float32)]
        return x,y
    
    # Training dataset in TFRecord format
    train_ds = fs.get_training_dataset(name=TRAIN_FS_NAME).tf_data(target_name='is_iceberg')
    train_ds = train_ds.tf_record_dataset(process=False, batch_size=TRAIN_BATCH_SIZE, num_epochs=EPOCHS)
    train_ds_processed = train_ds.map(decode).shuffle(SHUFFLE_BUFFER_SIZE).repeat(EPOCHS).cache().batch(TRAIN_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
    
    # Evaluation dataset in TFRecord format
    eval_ds = fs.get_training_dataset(name=TEST_FS_NAME).tf_data(target_name='is_iceberg')
    eval_ds = eval_ds.tf_record_dataset(process=False, batch_size=EVAL_BATCH_SIZE, num_epochs=EPOCHS)
    eval_ds_processed = eval_ds.map(decode).shuffle(SHUFFLE_BUFFER_SIZE).repeat(EPOCHS).cache().batch(EVAL_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
    
    # Start training the model.
    history = model.fit(
        train_ds_processed,
        epochs=EPOCHS,
        verbose=1,
        validation_data=eval_ds_processed,
        callbacks=callbacks
    )
    
    # 'metrics' is the return value of this function;
    # The values in 'metrics' will be printed to the notebook cell that launch the experiment
    # Maggy takes only the first value as a metric, you can change it regarding you needs
    metrics = history.history['loss'][-1]
    metrics_dict = {
        'train_loss': history.history['loss'][-1],
        'train_accuracy': history.history['accuracy'][-1],
        'val_loss': history.history['val_loss'][-1],
        'val_accuracy': history.history['val_accuracy'][-1],
    } 
    # ---------------- Training Process ----------------
    
    # ---------------- Save and Export ----------------
    # Export model as savedModel
    
    
    if json.loads(os.environ['TF_CONFIG'])['task']['index'] == 0:  
        
        export_path = os.getcwd() + '/model-' + str(uuid.uuid4())
        print('Exporting trained model to: {}'.format(export_path))

        tf.saved_model.save(model, export_path)

        print(f'Export Path: {export_path}')
        hopsworks_model.export(export_path, 'ship_iceberg_classifier', metrics=metrics_dict)    

    # 'hopsworks_model' is the module provided by hopsworks for exporting models
    # 'hopsworks_model' is a different name of 'hops.model' to avoid name clashes
    
    #hopsworks_model.export(export_path, 'ship_iceberg_classifier', metrics=metrics)    
    
    return metrics

In [19]:
from maggy import experiment
from maggy.experiment_config import TfDistributedConfig

# define the constructor parameters of your model
hparams = {
    # hyper parameters from the HPO
    "kernel": 4, 
    "pool": 3, 
    "dropout": 0.4383837440120528
}

config = TfDistributedConfig(name="Iceberg_Ship_Classification_with_distributed_training", model=create_model((75, 75, 3), hparams["kernel"], hparams['pool'], hparams['dropout']).__class__, train_set=[], test_set=[], hparams=hparams)
experiment.lagom(train_fn=train_fn, config=config)

HBox(children=(FloatProgress(value=0.0, description='Maggy experiment', max=1.0, style=ProgressStyle(descripti…

1: All executors registered: True
0: All executors registered: True
1: Connected. Call `.close()` to terminate connection gracefully.
0: Connected. Call `.close()` to terminate connection gracefully.
0: 
1: 
1: 
1: Epoch 1/3
0: 
0: Epoch 1/3
0: 
0: 
1: 
1: 
1: 
1: 
0: 
0: 
0: Epoch 2/3
1: Epoch 2/3
1: Epoch 3/3
0: Epoch 3/3
0: Exporting trained model to: /srv/hops/hopsdata/tmp/nm-local-dir/usercache/demo_ml_meb10180__meb10180/appcache/application_1623744201905_0005/container_e11_1623744201905_0005_01_000007/model-bf0ac207-d76e-4839-81e6-756607292d6e
0: 
0: 
0: Export Path: /srv/hops/hopsdata/tmp/nm-local-dir/usercache/demo_ml_meb10180__meb10180/appcache/application_1623744201905_0005/container_e11_1623744201905_0005_01_000007/model-bf0ac207-d76e-4839-81e6-756607292d6e
0: Started copying local path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/demo_ml_meb10180__meb10180/appcache/application_1623744201905_0005/container_e11_1623744201905_0005_01_000007/model-bf0ac207-d76e-4839-81e6-75660

## END of the Step3c