# Train Model on Distributed Cluster

## Define Cluster Spec

In [None]:
import tensorflow as tf

# One job named "worker" with two tasks, both hosted on this machine:
# task 0 listens on port 2222 and task 1 on port 2223.
cluster = tf.train.ClusterSpec({
    "worker": [
        "localhost:2222",
        "localhost:2223",
    ],
})

In [None]:
# Start from an empty default graph so ops left over from earlier runs of
# this notebook are not carried into the graphs built below.
tf.reset_default_graph()

## Start Worker 0 (localhost:2222)

In [None]:
# Create and immediately start (start=True) the server for task 0 of the
# "worker" job, i.e. the localhost:2222 entry of the cluster spec.
worker0 = tf.train.Server(
    cluster,
    job_name="worker",
    task_index=0,
    start=True,
)

print(worker0)

## Start Worker 1 (localhost:2223)

In [None]:
# Create and immediately start (start=True) the server for task 1 of the
# "worker" job, i.e. the localhost:2223 entry of the cluster spec.
worker1 = tf.train.Server(
    cluster,
    job_name="worker",
    task_index=1,
    start=True,
)

print(worker1)

## Define Computationally-Intensive Graph
Note that we're not assigning devices. TensorFlow will automatically select a GPU if one is available.

In [None]:
import tensorflow as tf

# Exponent handed to matpow() for both matrices in this cell.
# c1 and c2 are bound further down to the matpow() results, so no
# placeholder tf.Variable objects are created here: they would only add
# unused variables to the graph.
n = 2

def matpow(M, n):
    """Return the graph node for ``M`` raised to the ``n``-th matrix power.

    The result is built as a chain of ``n - 1`` ``tf.matmul`` ops.

    Args:
        M: Tensor that ``tf.matmul`` can multiply with itself (the call
            sites in this notebook pass a 10000x10000 matrix).
        n: Exponent, a Python int.

    Returns:
        ``M`` itself when ``n <= 1``; otherwise the product of ``n``
        copies of ``M``.
    """
    # Stop at n <= 1 so that matpow(M, 1) is M. A base case of ``n < 1``
    # would add one extra matmul and yield M ** (n + 1) instead of M ** n.
    if n <= 1:
        return M
    return tf.matmul(M, matpow(M, n - 1))

# Two independent 10000x10000 random matrices, each run through matpow().
# No tf.device() scope is used here, so placement is left to TensorFlow.
A = tf.random_normal([10000, 10000])
c1 = matpow(A, n)

B = tf.random_normal([10000, 10000])
c2 = matpow(B, n)

# NOTE: `sum` shadows the Python builtin; the session cell fetches the
# result under this name, so the name is kept.
sum = c1 + c2

## Create `tf.train.MonitoredTrainingSession`
There are many `tf.train.SessionRunHook` implementations.

In [None]:
import tensorflow as tf
import datetime

# Connect to the worker0 server as chief, evaluate `sum` once and report the
# wall-clock time of the run (fetching and printing the result included).
with tf.train.MonitoredTrainingSession(master=worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    elapsed = datetime.datetime.now() - start_time
    print("Execution time: " + str(elapsed))

## Execute Graph on Manually-Assigned Devices 

### All CPU Devices
Note the execution time.

In [None]:
import tensorflow as tf

# Exponent handed to matpow() for both matrices in this cell.
# c1 and c2 are bound further down to the matpow() results, so no
# placeholder tf.Variable objects are created here: they would only add
# unused variables to the graph.
n = 2

def matpow(M, n):
    """Return the graph node for ``M`` raised to the ``n``-th matrix power.

    The result is built as a chain of ``n - 1`` ``tf.matmul`` ops.

    Args:
        M: Tensor that ``tf.matmul`` can multiply with itself (the call
            sites in this notebook pass a 10000x10000 matrix).
        n: Exponent, a Python int.

    Returns:
        ``M`` itself when ``n <= 1``; otherwise the product of ``n``
        copies of ``M``.
    """
    # Stop at n <= 1 so that matpow(M, 1) is M. A base case of ``n < 1``
    # would add one extra matmul and yield M ** (n + 1) instead of M ** n.
    if n <= 1:
        return M
    return tf.matmul(M, matpow(M, n - 1))

# Pin the first matrix chain to the CPU of worker task 0 ...
with tf.device("/job:worker/task:0/cpu:0"):
    A = tf.random_normal([10000, 10000])
    c1 = matpow(A, n)

# ... and the second chain to the CPU of worker task 1.
with tf.device("/job:worker/task:1/cpu:0"):
    B = tf.random_normal([10000, 10000])
    c2 = matpow(B, n)

# The final add is created outside both device scopes.
# NOTE: `sum` shadows the Python builtin.
sum = c1 + c2

import tensorflow as tf
import datetime

# Connect to the worker0 server as chief, evaluate `sum` once and report the
# wall-clock time of the run (fetching and printing the result included).
with tf.train.MonitoredTrainingSession(master=worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    elapsed = datetime.datetime.now() - start_time
    print("Execution time: " + str(elapsed))
          

### CPU and GPU
Note the reduced execution time from the all-CPU execution.

In [None]:
import tensorflow as tf

# Exponent handed to matpow() for both matrices in this cell.
# c1 and c2 are bound further down to the matpow() results, so no
# placeholder tf.Variable objects are created here: they would only add
# unused variables to the graph.
n = 2

def matpow(M, n):
    """Return the graph node for ``M`` raised to the ``n``-th matrix power.

    The result is built as a chain of ``n - 1`` ``tf.matmul`` ops.

    Args:
        M: Tensor that ``tf.matmul`` can multiply with itself (the call
            sites in this notebook pass a 10000x10000 matrix).
        n: Exponent, a Python int.

    Returns:
        ``M`` itself when ``n <= 1``; otherwise the product of ``n``
        copies of ``M``.
    """
    # Stop at n <= 1 so that matpow(M, 1) is M. A base case of ``n < 1``
    # would add one extra matmul and yield M ** (n + 1) instead of M ** n.
    if n <= 1:
        return M
    return tf.matmul(M, matpow(M, n - 1))

# Pin the first matrix chain to GPU 0 of worker task 0 ...
with tf.device("/job:worker/task:0/gpu:0"):
    A = tf.random_normal([10000, 10000])
    c1 = matpow(A, n)

# ... and the second chain to the CPU of worker task 1.
with tf.device("/job:worker/task:1/cpu:0"):
    B = tf.random_normal([10000, 10000])
    c2 = matpow(B, n)

# The final add is created outside both device scopes.
# NOTE: `sum` shadows the Python builtin.
sum = c1 + c2

import tensorflow as tf
import datetime

# Connect to the worker0 server as chief, evaluate `sum` once and report the
# wall-clock time of the run (fetching and printing the result included).
with tf.train.MonitoredTrainingSession(master=worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    elapsed = datetime.datetime.now() - start_time
    print("Execution time: " + str(elapsed))
          

### All GPU Devices
Note that execution is slower than when we didn't specify devices. Why is that?

In [None]:
import tensorflow as tf

# Exponent handed to matpow() for both matrices in this cell.
# c1 and c2 are bound further down to the matpow() results, so no
# placeholder tf.Variable objects are created here: they would only add
# unused variables to the graph.
n = 2

def matpow(M, n):
    """Return the graph node for ``M`` raised to the ``n``-th matrix power.

    The result is built as a chain of ``n - 1`` ``tf.matmul`` ops.

    Args:
        M: Tensor that ``tf.matmul`` can multiply with itself (the call
            sites in this notebook pass a 10000x10000 matrix).
        n: Exponent, a Python int.

    Returns:
        ``M`` itself when ``n <= 1``; otherwise the product of ``n``
        copies of ``M``.
    """
    # Stop at n <= 1 so that matpow(M, 1) is M. A base case of ``n < 1``
    # would add one extra matmul and yield M ** (n + 1) instead of M ** n.
    if n <= 1:
        return M
    return tf.matmul(M, matpow(M, n - 1))

# Pin the first matrix chain to GPU 0 of worker task 0 ...
with tf.device("/job:worker/task:0/gpu:0"):
    A = tf.random_normal([10000, 10000])
    c1 = matpow(A, n)

# ... and the second chain to GPU 0 of worker task 1.
with tf.device("/job:worker/task:1/gpu:0"):
    B = tf.random_normal([10000, 10000])
    c2 = matpow(B, n)

# The final add is created outside both device scopes.
# NOTE: `sum` shadows the Python builtin.
sum = c1 + c2

import tensorflow as tf
import datetime

# Connect to the worker0 server as chief, evaluate `sum` once and report the
# wall-clock time of the run (fetching and printing the result included).
with tf.train.MonitoredTrainingSession(master=worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    elapsed = datetime.datetime.now() - start_time
    print("Execution time: " + str(elapsed))
          

## Execute Graph with Auto-Assigned Devices 
`tf.train.replica_device_setter()` uses round-robin by default. Note the execution time.

In [None]:
import tensorflow as tf

# Exponent handed to matpow() for both matrices in this cell.
# c1 and c2 are bound further down to the matpow() results, so no
# placeholder tf.Variable objects are created here: they would only add
# unused variables to the graph.
n = 2

def matpow(M, n):
    """Return the graph node for ``M`` raised to the ``n``-th matrix power.

    The result is built as a chain of ``n - 1`` ``tf.matmul`` ops.

    Args:
        M: Tensor that ``tf.matmul`` can multiply with itself (the call
            sites in this notebook pass a 10000x10000 matrix).
        n: Exponent, a Python int.

    Returns:
        ``M`` itself when ``n <= 1``; otherwise the product of ``n``
        copies of ``M``.
    """
    # Stop at n <= 1 so that matpow(M, 1) is M. A base case of ``n < 1``
    # would add one extra matmul and yield M ** (n + 1) instead of M ** n.
    if n <= 1:
        return M
    return tf.matmul(M, matpow(M, n - 1))

# Let tf.train.replica_device_setter() supply the device function for each
# matrix chain, naming worker task 0 and then worker task 1 as worker device.
# NOTE(review): the cluster spec in this notebook defines only a "worker"
# job and no "ps" job. Confirm that replica_device_setter() still returns a
# device function in that case; otherwise these ops are not pinned to the
# named workers at all.
with tf.device(
    tf.train.replica_device_setter(
        worker_device="/job:worker/task:0",
        cluster=cluster,
    )
):
    A = tf.random_normal([10000, 10000])
    c1 = matpow(A, n)

with tf.device(
    tf.train.replica_device_setter(
        worker_device="/job:worker/task:1",
        cluster=cluster,
    )
):
    B = tf.random_normal([10000, 10000])
    c2 = matpow(B, n)

# The final add is created outside both device scopes.
# NOTE: `sum` shadows the Python builtin.
sum = c1 + c2

import tensorflow as tf
import datetime

# Connect to the worker0 server as chief, evaluate `sum` once and report the
# wall-clock time of the run (fetching and printing the result included).
with tf.train.MonitoredTrainingSession(master=worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    elapsed = datetime.datetime.now() - start_time
    print("Execution time: " + str(elapsed))
          