#### Before we proceed, let's check that we're using the right image, that is, TensorFlow is available:

In [1]:
#! pip3 list | grep tensorflow 
#! pip3 install --user tensorflow==2.4.0
#! pip3 install --user ipywidgets nbconvert
#!python -m pip install --user --upgrade pip
#!pip3 install pandas scikit-learn keras tensorflow-datasets --user

#### To package the trainer in a container image, we shall need a file (on our cluster) that contains the code as well as a file with the resource definitition of the job for the Kubernetes cluster:

In [2]:
TRAINER_FILE = "tfjobheart.py"
KUBERNETES_FILE = "tfjob-heartdisease.yaml"

#### We also want to capture output from a cell with %%capture that usually looks like some-resource created. To that end, let's define a helper function:

In [3]:
import re

from IPython.utils.capture import CapturedIO


def get_resource(captured_io: CapturedIO) -> str:
    """
    Gets a resource name from `kubectl apply -f <configuration.yaml>`.

    :param str captured_io: Output captured by using `%%capture` cell magic
    :return: Name of the Kubernetes resource
    :rtype: str
    :raises Exception: if the resource could not be created
    """
    out = captured_io.stdout
    matches = re.search(r"^(.+)\s+created", out)
    if matches is not None:
        return matches.group(1)
    else:
        raise Exception(f"Cannot get resource as its creation failed: {out}. It may already exist.")

#### Load and Inspect the Data¶

In [4]:
import pandas as  pd
data = pd.read_csv("heart.csv")
data.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


#### Train the Model in the Notebook

We want to train the model in a distributed fashion, we put all the code in a single cell. That way we can save the file and include it in a container image:

In [22]:
%%writefile $TRAINER_FILE
import argparse
import logging
import json
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler

from numpy.random import seed

import tensorflow as tf
tf.random.set_seed(221)
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import SGD, Adam, RMSprop

logging.getLogger().setLevel(logging.INFO)

def make_datasets_unbatched():
    data = pd.read_csv("heart.csv")
    data.head()
    
    data.apply(lambda x: sum(x.isnull()),axis=0)
    
    # List of variables with missing values
    
    vars_with_na=[var for var in data.columns if data[var].isnull().sum()>1]
    
    #Boolan variables
    bool_var=['sex','output','fbs', 'exng']
    #Categorical variables:cardinalty
    cat_var=['cp','restecg','slp', 'thall']
    #discrete variables
    num_var=['age', 'trtbps','chol', 'thalachh', 'oldpeak', 'caa']
    #remove outliers
    
    def removeOutlier(att, data):
        lowerbound = att.mean() - 3 * att.std()
        upperbound = att.mean() + 3 * att.std()
        #print('lowerbound: ',lowerbound,' -------- upperbound: ', upperbound )
        df1 = data[(att > lowerbound) & (att < upperbound)]
        #print((data.shape[0] - df1.shape[0]), ' number of outliers from ', data.shape[0] )
        #print(' ******************************************************')
        data = df1.copy()
        return data
    data = removeOutlier(data.trtbps, data)
    data = removeOutlier(data.chol, data)
    
    #resampling
    from sklearn.utils import resample
    
    # Separate Target Classes
    df_1 = data[data.output==1]
    df_2 = data[data.output==0]
    
    # Upsample minority class
    df_upsample_1 = resample(df_2, 
                                 replace=True,     # sample with replacement
                                 n_samples=163,    # to match majority class
                                 random_state=123) # reproducible results
    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_1, df_upsample_1])
    
    # Display new class counts
    df_upsampled.output.value_counts()
    
    x = df_upsampled.drop('output', axis = 1)
    y = df_upsampled['output']
    
    #Split dataset

    
    x_train,x_test, y_train, y_test = tts(x,y, test_size = 0.2, random_state = 111)
    
    #Scaling
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.fit_transform(x_test)
    
    train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    train = train_dataset.cache().shuffle(2000).repeat()
    return train, test_dataset

def model(args):
    seed(1)
    model = Sequential()
    model.add(Dense(10, activation='relu', input_dim=13))
    #model.add(BatchNormalization())
    model.add(Dense(10, activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    model.summary()
    opt = args.optimizer
    model.compile(optimizer=opt,
                loss='binary_crossentropy',
                metrics=['accuracy'])
    tf.keras.backend.set_value(model.optimizer.learning_rate, args.learning_rate)
    return model


def main(args):
    # MultiWorkerMirroredStrategy creates copies of all variables in the model's
    # layers on each device across all workers
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
        communication=tf.distribute.experimental.CollectiveCommunication.AUTO)
    logging.debug(f"num_replicas_in_sync: {strategy.num_replicas_in_sync}")
    BATCH_SIZE_PER_REPLICA = args.batch_size
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
    
    # Datasets need to be created after instantiation of `MultiWorkerMirroredStrategy`
    train_dataset, test_dataset = make_datasets_unbatched()
    train_dataset = train_dataset.batch(batch_size=BATCH_SIZE)
    test_dataset = test_dataset.batch(batch_size=BATCH_SIZE)

    # See: https://www.tensorflow.org/api_docs/python/tf/data/experimental/DistributeOptions
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = \
    tf.data.experimental.AutoShardPolicy.DATA
    
    train_datasets_sharded  = train_dataset.with_options(options)
    test_dataset_sharded = test_dataset.with_options(options)
    
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = model(args)
        # Keras' `model.fit()` trains the model with specified number of epochs and
        # number of steps per epoch. 
        multi_worker_model.fit(train_datasets_sharded,
                         epochs=50,
                         steps_per_epoch=30)

        eval_loss, eval_acc = multi_worker_model.evaluate(test_dataset_sharded, 
                                                    verbose=0, steps=10)
        # Log metrics for Katib
        logging.info("loss={:.4f}".format(eval_loss))
        logging.info("accuracy={:.4f}".format(eval_acc))
        
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size",
                      type=int,
                      default=32,
                      metavar="N",
                      help="Batch size for training (default: 128)")
    parser.add_argument("--learning_rate", 
                      type=float,  
                      default=0.1,
                      metavar="N",
                      help='Initial learning rate')
    parser.add_argument("--optimizer", 
                      type=str, 
                      default='adam',
                      metavar="N",
                      help='optimizer')
    parsed_args, _ = parser.parse_known_args()
    main(parsed_args)

Overwriting tfjobheart.py


#### That saves the file as defined by TRAINER_FILE but it does not run it.

Let's see if our code is correct by running it from within our notebook:

In [23]:
%run $TRAINER_FILE --optimizer 'adam'

INFO:tensorflow:Using MirroredStrategy with devices ('/device:CPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/device:CPU:0',)


INFO:tensorflow:Single-worker MultiWorkerMirroredStrategy with local_devices = ('/device:CPU:0',), communication = CommunicationImplementation.AUTO


INFO:tensorflow:Single-worker MultiWorkerMirroredStrategy with local_devices = ('/device:CPU:0',), communication = CommunicationImplementation.AUTO


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 10)                140       
_________________________________________________________________
dense_10 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 11        
Total params: 261
Trainable params: 261
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50

INFO:root:loss=1.2972
INFO:root:accuracy=0.9091


#### Create a Distributed TFJob


For large training jobs, we wish to run our trainer in a distributed mode. Once the notebook server cluster can access the Docker image from the registry, we can launch a distributed TF job.

The specification for a distributed TFJob is defined using YAML:

In [7]:
%%writefile $KUBERNETES_FILE
apiVersion: "kubeflow.org/v1"
kind: "TFJob"
metadata:
  name: "hrtd"
  namespace: ekemini # your-user-namespace
spec:
  cleanPodPolicy: None
  tfReplicaSpecs:
    Worker:
      replicas: 2
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "false"
        spec:
          containers:
          - name: tensorflow
            # modify this property if you would like to use a custom image
            image: mavencodevv/tfjob_heart:v.0.1
            command:
                - "python"
                - "/tfjobheart.py"
                - "--batch_size=64"
                - "--learning_rate=0.1"
                - "--optimizer=adam"

Overwriting tfjob-heartdisease.yaml


#### Let's deploy the distributed training job:

In [9]:
%%capture tf_output --no-stderr
! kubectl create -f $KUBERNETES_FILE

In [10]:
TF_JOB = get_resource(tf_output)

##### To see the job status, use the following command:

In [11]:
! kubectl describe $TF_JOB

Name:         hrtd
Namespace:    ekemini
Labels:       <none>
Annotations:  <none>
API Version:  kubeflow.org/v1
Kind:         TFJob
Metadata:
  Creation Timestamp:  2021-04-13T05:59:54Z
  Generation:          1
  Managed Fields:
    API Version:  kubeflow.org/v1
    Fields Type:  FieldsV1
    fieldsV1:
      f:spec:
        .:
        f:cleanPodPolicy:
        f:tfReplicaSpecs:
          .:
          f:Worker:
            .:
            f:replicas:
            f:restartPolicy:
            f:template:
              .:
              f:metadata:
                .:
                f:annotations:
                  .:
                  f:sidecar.istio.io/inject:
              f:spec:
    Manager:      kubectl-create
    Operation:    Update
    Time:         2021-04-13T05:59:54Z
    API Version:  kubeflow.org/v1
    Fields Type:  FieldsV1
    fieldsV1:
      f:spec:
        f:successPolicy:
        f:tfReplicaSpecs:
          f:Worker:
            f:template:
              f:metadata:
     

In [12]:
! kubectl get pods -l job-name=hrtd

NAME            READY   STATUS              RESTARTS   AGE
hrtd-worker-0   0/1     ContainerCreating   0          2s
hrtd-worker-1   0/1     ContainerCreating   0          2s


In [13]:
! kubectl get events --sort-by='.lastTimestamp' | tail

3s          Normal    Scheduled                 pod/hrtd-worker-1                                                          Successfully assigned ekemini/hrtd-worker-1 to ip-10-0-19-4.ec2.internal
3s          Normal    Scheduled                 pod/hrtd-worker-0                                                          Successfully assigned ekemini/hrtd-worker-0 to ip-10-0-10-33.ec2.internal
3s          Normal    SuccessfulCreatePod       tfjob/hrtd                                                                 Created pod: hrtd-worker-1
3s          Normal    SuccessfulCreateService   tfjob/hrtd                                                                 Created service: hrtd-worker-0
2s          Normal    Pulled                    pod/hrtd-worker-1                                                          Container image "mavencodevv/tfjob_heart:v.0.1" already present on machine
2s          Normal    Started                   pod/hrtd-worker-0                                        

To stream logs from the worker-0 pod to check the training progress, run the following command:

In [14]:
! kubectl logs -f hrtd-worker-0

2021-04-13 05:59:56.534068: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-04-13 05:59:56.534103: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Instructions for updating:
use distribute.MultiWorkerMirroredStrategy instead
2021-04-13 05:59:58.336024: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-13 05:59:58.336264: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-04-13 05:59:58.336288: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-04-13 05:59:58.336313: I tensorflow/stream_execu

#### To delete the job, run the following command:

In [24]:
#! kubectl delete $TF_JOB

#### Check to see if the check to see if the pod is still up and running

In [16]:
#! kubectl -n ekemini logs -f hrtd