# Iceberg Classification Step 3b: Ablation Study
This notebook demonstrates:
- getting data from the ``feature store``
- training with Maggy
- running an ablation study

In [1]:
import tensorflow as tf
import hops
from hops import tensorboard
from hops import model as hopsworks_model
import hsfs
import maggy
import maggy.version

# `spark` is not created in this notebook; the kernel reports
# "SparkSession available as 'spark'" when the application starts.
# Assemble the version banner line by line, then print it in one call.
version_report = "".join([
    "-----------------------------------------------\n",
    "This notebook is tested with:\n",
    f"  - TensorFlow {tf.__version__}.\n",
    f"  - Hopsworks {hops.__version__}.\n",
    f"  - Maggy {maggy.version.__version__}.\n",
    f"  - Spark {spark.version}.\n",
])
print(version_report)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
70,application_1623744201905_0003,pyspark,idle,Link,Link


SparkSession available as 'spark'.
-----------------------------------------------
This notebook is tested with:
  - TensorFlow 2.4.1.
  - Hopsworks 2.2.0.1.
  - Maggy 1.0.0rc0.
  - Spark 2.4.3.2.

In [2]:
def create_model(input_shape=(75, 75, 3)):
    """Returns a classification model for ablation study.

    The layers of the four convolutional stages are explicitly named
    (``my_conv_N``, ``my_maxpool_N``, ``my_dropout_N``) so that they can be
    referenced by name when selecting layers for the ablation study.

    Parameters:
    - input_shape(tuple): input shape of the CNN model. Defaults to
      (75, 75, 3), so the function can still be called without arguments.

    Returns:
    - a TensorFlow keras model that is not compiled yet.

    """
    # Number of filters of each convolutional stage, in order.
    conv_filters = (16, 32, 32, 16)
    # Number of units of each hidden dense stage, in order.
    dense_units = (256, 128)

    model = tf.keras.models.Sequential()

    # Convolutional stages: Conv2D -> MaxPooling2D -> Dropout
    for stage, filters in enumerate(conv_filters, start=1):
        conv_kwargs = {
            'kernel_size': (3, 3),
            'activation': 'relu',
            'name': f'my_conv_{stage}',
        }
        if stage == 1:
            # Only the first layer of the Sequential model declares the input shape.
            conv_kwargs['input_shape'] = input_shape
        model.add(tf.keras.layers.Conv2D(filters, **conv_kwargs))
        model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name=f'my_maxpool_{stage}'))
        model.add(tf.keras.layers.Dropout(0.2, name=f'my_dropout_{stage}'))

    # Flatten the data for upcoming dense layers
    model.add(tf.keras.layers.Flatten())

    # Hidden dense stages: Dense -> ReLU -> Dropout
    # (Dense and Activation stay separate layers so the layer list is unchanged.)
    for units in dense_units:
        model.add(tf.keras.layers.Dense(units))
        model.add(tf.keras.layers.Activation('relu'))
        model.add(tf.keras.layers.Dropout(0.2))

    # Sigmoid output layer
    model.add(tf.keras.layers.Dense(1))
    model.add(tf.keras.layers.Activation('sigmoid'))
    return model

In [3]:
from maggy.callbacks import KerasBatchEnd

def create_datasets(epochs):
    """Build the training and evaluation ``tf.data`` pipelines from the feature store.

    Parameters:
    - epochs(int): number of passes over the training samples contained in
      the returned training dataset; also forwarded as ``num_epochs`` to
      ``tf_record_dataset``.

    Returns:
    - train_ds_processed: decoded, shuffled, repeated and batched training dataset.
    - eval_ds_processed: decoded and batched evaluation dataset
      (single pass, not shuffled).
    """
    # Establish a connection with the Hopsworks feature store
    #     engine='training' is needed so that the executors in Spark can connect to feature store
    connection = hsfs.connection(engine='training')
    # Get the feature store handle for the project's feature store
    fs = connection.get_feature_store()

    # Name of the training dataset in feature store
    TRAIN_FS_NAME = 'train_tfrecords_iceberg_classification_dataset'
    # Name of the test dataset in feature store
    TEST_FS_NAME = 'test_tfrecords_iceberg_classification_dataset'
    # Number of epochs to train
    EPOCHS = epochs
    # Training batch size
    TRAIN_BATCH_SIZE = 32
    # Evaluation batch size
    EVAL_BATCH_SIZE = 1
    # Shuffle buffer size for TensorFlow dataset
    SHUFFLE_BUFFER_SIZE = 10000

    def decode(sample):
        """Decode each training sample.

        This function decodes each sample and returns it in a format that is ready for training.

        Parameters:
        - sample: raw features of a data sample stored in a dictionary-like object

        Returns:
        - x: 'band_1', 'band_2', and 'band_avg' will be stacked and reshaped
             to [75, 75, 3] and form the input of the model
        - y: 'is_iceberg' cast to float32; it will be the output of the model.
        """
        x = tf.stack([sample['band_1'], sample['band_2'], sample['band_avg']], axis=1)
        x = tf.reshape(x, [75, 75, 3])
        y = [tf.cast(sample['is_iceberg'], tf.float32)]
        return x, y

    def load_raw(fs_name, batch_size):
        """Read the named training dataset from the feature store as an unprocessed TFRecord dataset."""
        feeder = fs.get_training_dataset(name=fs_name).tf_data(target_name='is_iceberg')
        return feeder.tf_record_dataset(process=False, batch_size=batch_size, num_epochs=EPOCHS)

    # Training dataset in TFRecord format.
    # cache() comes right after decoding so that a single decoded pass is stored;
    # shuffle() comes after cache() so that the order is re-drawn on every pass
    # instead of being frozen inside the cache.
    # NOTE(review): repeat(EPOCHS) is kept to preserve the amount of training data;
    #     a caller that also passes epochs=EPOCHS to model.fit without
    #     steps_per_epoch iterates EPOCHS passes per Keras epoch - confirm this is intended.
    train_ds_processed = (
        load_raw(TRAIN_FS_NAME, TRAIN_BATCH_SIZE)
        .map(decode)
        .cache()
        .shuffle(SHUFFLE_BUFFER_SIZE)
        .repeat(EPOCHS)
        .batch(TRAIN_BATCH_SIZE)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    # Evaluation dataset in TFRecord format.
    # Evaluation metrics are averages over the samples, so shuffling or repeating
    # the evaluation data only adds cost; one ordered pass is enough.
    eval_ds_processed = (
        load_raw(TEST_FS_NAME, EVAL_BATCH_SIZE)
        .map(decode)
        .cache()
        .batch(EVAL_BATCH_SIZE)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    return train_ds_processed, eval_ds_processed

In [4]:
def train_fn(dataset_function, model_function):
    """Wrapper function for the experiment.

    Builds the model, compiles it, trains it on the training dataset and
    validates it on the evaluation dataset.

    Parameters:
    - dataset_function: callable that takes the number of epochs and returns
      the (training, evaluation) datasets.
    - model_function: callable that takes no arguments and returns a keras
      model that is not compiled yet.

    Returns:
    - the validation accuracy recorded for the last training epoch.

    """
    # ---------------- Initialization ----------------

    # Clear session info
    tf.keras.backend.clear_session()

    # Set up visible GPU
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)

    # ---------------- Hyperparameters ----------------
    # Number of epochs to train
    EPOCHS = 3
    # learning rate of the optimizer during training
    LEARNING_RATE = 0.001

    # ---------------- Training Process ----------------
    model = model_function()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    train_ds_processed, eval_ds_processed = dataset_function(EPOCHS)

    # Start training the model.
    history = model.fit(
        train_ds_processed,
        epochs=EPOCHS,
        verbose=1,
        validation_data=eval_ds_processed,
    )

    # Only the final validation accuracy is returned; the other values in
    # history.history ('loss', 'accuracy', 'val_loss') are not reported.
    return history.history['val_accuracy'][-1]

## Ablation Study Part

In [5]:
from maggy.ablation import AblationStudy

# Describe the study: the first argument is the training dataset name,
# followed by the dataset version and the label column.
iceberg_ablation = AblationStudy(
    'iceberg',
    training_dataset_version=1,
    label_name="is_iceberg",
)

# Register the custom dataset generator function ...
iceberg_ablation.set_dataset_generator(create_datasets)
# ... and the generator of the base model.
iceberg_ablation.model.set_base_model_generator(create_model)

# Single layers to include in the ablation study, referenced by layer name.
# Other named layers ('my_maxpool_1'..'my_maxpool_4', 'my_dropout_1'..'my_dropout_4')
# can be included the same way.
conv_layer_names = [f'my_conv_{index}' for index in range(1, 5)]
iceberg_ablation.model.layers.include(conv_layer_names)

iceberg_ablation.model.layers.print_all()

# Layer groups can be added through a name prefix instead, e.g.:
#     iceberg_ablation.model.layers.include_groups(prefix='my_conv')
#     iceberg_ablation.model.layers.include_groups(prefix='my_maxpool')
#     iceberg_ablation.model.layers.include_groups(prefix='my_dropout')
# and listed with:
#     iceberg_ablation.model.layers.print_all_groups()

print(f'\n\nAblation Study summary: \n {iceberg_ablation.to_dict()}')

Included single layers are: 

my_conv_1
my_conv_3
my_conv_4
my_conv_2


Ablation Study summary: 
 {'training_dataset_name': 'iceberg', 'training_dataset_version': 1, 'label_name': 'is_iceberg', 'included_features': [], 'included_layers': ['my_conv_1', 'my_conv_3', 'my_conv_4', 'my_conv_2'], 'custom_dataset_generator': True}

In [6]:
# Create a config for lagom
from maggy.experiment_config import AblationConfig

# Experiment configuration wrapping the ablation study defined above.
config = AblationConfig(
    name='Iceberg_ship_classifier_ablation_study',
    ablation_study=iceberg_ablation,
    ablator='loco',
    description='Ablation_Study',
    hb_interval=1,
)

In [7]:
from maggy import experiment

# Launch the experiment: lagom is given the training function and the ablation config.
result = experiment.lagom(
    train_fn=train_fn,
    config=config,
)

HBox(children=(FloatProgress(value=0.0, description='Maggy experiment', max=5.0, style=ProgressStyle(descripti…

0: Connected. Call `.close()` to terminate connection gracefully.
0: 
0: 
0: Epoch 1/3
0: 
0: 
1: Connected. Call `.close()` to terminate connection gracefully.
1: 
1: 
1: Epoch 1/3
1: 
1: 
0: 
0: 
1: 
1: 
0: Epoch 2/3
1: Epoch 2/3
0: Epoch 3/3
1: Epoch 3/3
0: Connected. Call `.close()` to terminate connection gracefully.
0: Epoch 1/3
1: Connected. Call `.close()` to terminate connection gracefully.
1: Epoch 1/3
0: Epoch 2/3
1: Epoch 2/3
0: Epoch 3/3
1: Epoch 3/3
1: Connected. Call `.close()` to terminate connection gracefully.
1: Epoch 1/3
1: Epoch 2/3
1: Epoch 3/3
You are running Maggy on Hopsworks.

------ LOCO Results ------ 
BEST Config Excludes {"ablated_feature": "None", "ablated_layer": "my_conv_4"} -- metric 0.8181818127632141
WORST Config Excludes {"ablated_feature": "None", "ablated_layer": "my_conv_1"} -- metric 0.7184750437736511
AVERAGE metric -- 0.7612903237342834
Total Job Time 0 hours, 3 minutes, 24 seconds

Finished experiment.


# End of Step 3b