# Train Model on Distributed Cluster

## Define Cluster Spec

In [4]:
import tensorflow as tf

# Single "worker" job with two tasks, both on localhost but on different ports.
worker_addresses = ["localhost:2222", "localhost:2223"]
cluster = tf.train.ClusterSpec({"worker": worker_addresses})

In [5]:
# Start from an empty default graph before any ops are defined in the cells below.
tf.reset_default_graph()

## Start Worker 0 (localhost:2222)

In [6]:
# Server for task 0 of the "worker" job (localhost:2222 in the cluster spec);
# start=True is passed so no separate start call is made here.
worker0 = tf.train.Server(
    cluster, job_name="worker", task_index=0, start=True
)

print(worker0)

<tensorflow.python.training.server_lib.Server object at 0x7f81786c1978>


## Start Worker 1 (localhost:2223)

In [7]:
# Server for task 1 of the "worker" job (localhost:2223 in the cluster spec);
# start=True is passed so no separate start call is made here.
worker1 = tf.train.Server(
    cluster, job_name="worker", task_index=1, start=True
)

print(worker1)

<tensorflow.python.training.server_lib.Server object at 0x7f817861e828>


## Define Computationally-Intensive Graph
Note that we're not assigning devices.  TensorFlow will automatically select GPU if available.

In [10]:
import tensorflow as tf

# Exponent argument handed to matpow further down in this cell.
n = 2
# Initial bindings only: c1 and c2 are rebound to matpow results later in this cell.
# NOTE(review): each tf.Variable([]) call presumably still registers a variable
# in the default graph even after the name is rebound - confirm it is intended.
c1 = tf.Variable([])
c2 = tf.Variable([])

def matpow(M, n):
    """Chain ``tf.matmul`` products of ``M`` with itself.

    Args:
        M: The operand; it is passed straight to ``tf.matmul``.
        n: Step count. When ``n < 1`` no multiplication is performed.

    Returns:
        ``M`` itself when ``n < 1``. Otherwise ``M`` left-multiplied onto the
        running product once per unit of ``n``, so for an integer ``n >= 1``
        the result contains ``n + 1`` factors of ``M``.
    """
    product = M
    remaining = n
    # Same stopping rule as the base case "n < 1": keep multiplying until it holds.
    while not remaining < 1:
        product = tf.matmul(M, product)
        remaining -= 1
    return product

# Two independent random 10000x10000 operands. No tf.device scope is used in
# this cell, so device placement is left to TensorFlow.
A = tf.random_normal(shape=[10000, 10000])
# With n = 2 and matpow's "n < 1" base case, this chains two tf.matmul calls.
c1 = matpow(A,n)

B = tf.random_normal(shape=[10000, 10000])
c2 = matpow(B,n)

# NOTE: binding the name `sum` hides Python's built-in sum() in this notebook's
# namespace; later cells run this tensor via sess.run(sum).
sum = c1 + c2

## Create `tf.train.MonitoredTrainingSession`
There are many `tf.train.SessionRunHook` implementations.

In [12]:
import tensorflow as tf
import datetime

# Run the graph through the server started as worker 0, acting as chief.
with tf.train.MonitoredTrainingSession(worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    result = sess.run(sum)
    # Stop the clock before printing so the reported time covers only
    # sess.run, not the formatting/printing of the result array.
    elapsed = datetime.datetime.now() - start_time
    print(result)
    print("Execution time: " + str(elapsed))

[[  1016.33886719   5086.08496094 -21478.5        ..., -23559.35742188
   -8796.45507812   2257.31713867]
 [ 22592.65625      7863.86621094  -4279.58886719 ..., -18607.12109375
      92.41992188  21984.7421875 ]
 [ -8022.40380859 -16408.69726562 -18995.96484375 ...,  -6476.82861328
   -3648.82128906 -24430.84375   ]
 ..., 
 [  8047.33935547 -28624.1875      -6854.82128906 ..., -32593.359375
   29987.5546875    9121.9296875 ]
 [ -9352.12109375  31952.13085938  15410.90820312 ...,  -3708.57177734
   37230.53515625  -2356.59399414]
 [   842.046875     6426.86816406    931.88232422 ..., -14624.67578125
   -1867.80810547  -3675.32055664]]
Execution time: 0:00:03.744440


## Execute Graph on Manually-Assigned Devices 

### All CPU Devices
Note the execution time.

In [13]:
import tensorflow as tf

# Exponent argument handed to matpow further down in this cell.
n = 2
# Initial bindings only: c1 and c2 are rebound to matpow results later in this cell.
# NOTE(review): each tf.Variable([]) call presumably still registers a variable
# in the default graph even after the name is rebound - confirm it is intended.
c1 = tf.Variable([])
c2 = tf.Variable([])

def matpow(M, n):
    """Chain ``tf.matmul`` products of ``M`` with itself.

    Args:
        M: The operand; it is passed straight to ``tf.matmul``.
        n: Step count. When ``n < 1`` no multiplication is performed.

    Returns:
        ``M`` itself when ``n < 1``. Otherwise ``M`` left-multiplied onto the
        running product once per unit of ``n``, so for an integer ``n >= 1``
        the result contains ``n + 1`` factors of ``M``.
    """
    product = M
    remaining = n
    # Same stopping rule as the base case "n < 1": keep multiplying until it holds.
    while not remaining < 1:
        product = tf.matmul(M, product)
        remaining -= 1
    return product

# Build A and its matmul chain under worker task 0's CPU device scope.
with tf.device("/job:worker/task:0/cpu:0"):
    A = tf.random_normal(shape=[10000, 10000])
    # With n = 2 and matpow's "n < 1" base case, this chains two tf.matmul calls.
    c1 = matpow(A,n)

# Build B and its matmul chain under worker task 1's CPU device scope.
with tf.device("/job:worker/task:1/cpu:0"):
    B = tf.random_normal(shape=[10000, 10000])
    c2 = matpow(B,n)

# The final add is created outside both device scopes, so no device is requested for it.
# NOTE: binding the name `sum` hides Python's built-in sum() in this notebook's namespace.
sum = c1 + c2

import tensorflow as tf
import datetime

# Run the graph through the server started as worker 0, acting as chief.
with tf.train.MonitoredTrainingSession(worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    result = sess.run(sum)
    # Stop the clock before printing so the reported time covers only
    # sess.run, not the formatting/printing of the result array.
    elapsed = datetime.datetime.now() - start_time
    print(result)
    print("Execution time: " + str(elapsed))
          

[[  -509.796875     1314.65673828  23200.33984375 ...,  -3905.23828125
    7695.48291016   -202.14257812]
 [ -4774.43359375  -2562.40136719  11767.11523438 ...,   8130.13916016
   16203.84277344  18745.92578125]
 [ -1529.10449219  -3331.62255859  -4387.42773438 ...,  -5168.890625
    5861.85888672 -16078.02050781]
 ..., 
 [-29718.00976562 -20533.46875      9069.92089844 ..., -18745.94335938
  -12319.10351562   7450.47265625]
 [-13714.77832031  -4216.63867188  14876.51660156 ...,  17193.30859375
    7786.05810547   9686.54980469]
 [ 13452.03320312   1807.39306641 -20156.81445312 ...,  28342.42382812
   19215.2734375    7902.58642578]]
Execution time: 0:00:30.404982


### CPU and GPU
Note the reduced execution time from the all-CPU execution.

In [14]:
import tensorflow as tf

# Exponent argument handed to matpow further down in this cell.
n = 2
# Initial bindings only: c1 and c2 are rebound to matpow results later in this cell.
# NOTE(review): each tf.Variable([]) call presumably still registers a variable
# in the default graph even after the name is rebound - confirm it is intended.
c1 = tf.Variable([])
c2 = tf.Variable([])

def matpow(M, n):
    """Chain ``tf.matmul`` products of ``M`` with itself.

    Args:
        M: The operand; it is passed straight to ``tf.matmul``.
        n: Step count. When ``n < 1`` no multiplication is performed.

    Returns:
        ``M`` itself when ``n < 1``. Otherwise ``M`` left-multiplied onto the
        running product once per unit of ``n``, so for an integer ``n >= 1``
        the result contains ``n + 1`` factors of ``M``.
    """
    product = M
    remaining = n
    # Same stopping rule as the base case "n < 1": keep multiplying until it holds.
    while not remaining < 1:
        product = tf.matmul(M, product)
        remaining -= 1
    return product

# Build A and its matmul chain under worker task 0's GPU device scope.
with tf.device("/job:worker/task:0/gpu:0"):
    A = tf.random_normal(shape=[10000, 10000])
    # With n = 2 and matpow's "n < 1" base case, this chains two tf.matmul calls.
    c1 = matpow(A,n)

# Build B and its matmul chain under worker task 1's CPU device scope.
with tf.device("/job:worker/task:1/cpu:0"):
    B = tf.random_normal(shape=[10000, 10000])
    c2 = matpow(B,n)

# The final add is created outside both device scopes, so no device is requested for it.
# NOTE: binding the name `sum` hides Python's built-in sum() in this notebook's namespace.
sum = c1 + c2

import tensorflow as tf
import datetime

# Run the graph through the server started as worker 0, acting as chief.
with tf.train.MonitoredTrainingSession(worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    result = sess.run(sum)
    # Stop the clock before printing so the reported time covers only
    # sess.run, not the formatting/printing of the result array.
    elapsed = datetime.datetime.now() - start_time
    print(result)
    print("Execution time: " + str(elapsed))
          

[[  1294.29589844 -11670.43945312  -2398.37841797 ...,  25865.5390625
   -5128.17919922 -15327.59472656]
 [  1014.58007812 -12306.48925781  -7836.98291016 ...,   -387.49804688
    5531.7578125   -1679.28503418]
 [-11163.03222656 -19190.5703125   -5518.64355469 ..., -11151.46679688
   -9940.95214844 -13142.30175781]
 ..., 
 [ 13368.81054688  11029.58300781  -8992.62890625 ...,   5036.75244141
   -8550.47460938     28.94140625]
 [   985.68408203  12802.31933594 -11027.72851562 ...,  10220.23242188
   19238.91015625  16036.50683594]
 [-18154.91796875 -12422.08300781  -5359.39648438 ..., -21455.4609375
  -14445.66015625 -26871.25      ]]
Execution time: 0:00:17.447517


### All GPU Devices
Note that execution is slower than when we didn't specify devices.  Why is that?

In [15]:
import tensorflow as tf

# Exponent argument handed to matpow further down in this cell.
n = 2
# Initial bindings only: c1 and c2 are rebound to matpow results later in this cell.
# NOTE(review): each tf.Variable([]) call presumably still registers a variable
# in the default graph even after the name is rebound - confirm it is intended.
c1 = tf.Variable([])
c2 = tf.Variable([])

def matpow(M, n):
    """Chain ``tf.matmul`` products of ``M`` with itself.

    Args:
        M: The operand; it is passed straight to ``tf.matmul``.
        n: Step count. When ``n < 1`` no multiplication is performed.

    Returns:
        ``M`` itself when ``n < 1``. Otherwise ``M`` left-multiplied onto the
        running product once per unit of ``n``, so for an integer ``n >= 1``
        the result contains ``n + 1`` factors of ``M``.
    """
    product = M
    remaining = n
    # Same stopping rule as the base case "n < 1": keep multiplying until it holds.
    while not remaining < 1:
        product = tf.matmul(M, product)
        remaining -= 1
    return product

# Build A and its matmul chain under worker task 0's GPU device scope.
# NOTE(review): both worker tasks are on localhost in the cluster spec, so the
# two gpu:0 devices named in this cell presumably refer to the same physical
# GPU - confirm on the target machine.
with tf.device("/job:worker/task:0/gpu:0"):
    A = tf.random_normal(shape=[10000, 10000])
    # With n = 2 and matpow's "n < 1" base case, this chains two tf.matmul calls.
    c1 = matpow(A,n)

# Build B and its matmul chain under worker task 1's GPU device scope.
with tf.device("/job:worker/task:1/gpu:0"):
    B = tf.random_normal(shape=[10000, 10000])
    c2 = matpow(B,n)

# The final add is created outside both device scopes, so no device is requested for it.
# NOTE: binding the name `sum` hides Python's built-in sum() in this notebook's namespace.
sum = c1 + c2

import tensorflow as tf
import datetime

# Run the graph through the server started as worker 0, acting as chief.
with tf.train.MonitoredTrainingSession(worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    result = sess.run(sum)
    # Stop the clock before printing so the reported time covers only
    # sess.run, not the formatting/printing of the result array.
    elapsed = datetime.datetime.now() - start_time
    print(result)
    print("Execution time: " + str(elapsed))
          

[[  1602.25048828  -3084.27905273  10950.75585938 ...,   2505.19482422
   -5372.36816406  -5412.41259766]
 [ 17306.75390625  14299.77539062  -6978.29052734 ...,   4840.109375
   -7943.2578125    6473.67773438]
 [ 21306.8515625  -21384.18359375 -19334.47265625 ...,   2961.40869141
   -6882.56689453   2505.97949219]
 ..., 
 [-11346.84960938  -3797.17773438 -17888.81640625 ...,   1532.88037109
   -4650.39355469  10317.73730469]
 [ -1885.30297852    487.67919922 -36493.9453125  ...,    751.8651123
  -12599.59570312  -8101.55029297]
 [-35922.453125    13188.16113281  -5478.03710938 ...,    338.32543945
    8220.47460938   -179.34619141]]
Execution time: 0:00:06.655171


## Execute Graph with Auto-Assigned Devices 
`tf.train.replica_device_setter()` uses round-robin by default. Note the execution time.

In [32]:
import tensorflow as tf

# Exponent argument handed to matpow further down in this cell.
n = 2
# Initial bindings only: c1 and c2 are rebound to matpow results later in this cell.
# NOTE(review): each tf.Variable([]) call presumably still registers a variable
# in the default graph even after the name is rebound - confirm it is intended.
c1 = tf.Variable([])
c2 = tf.Variable([])

def matpow(M, n):
    """Chain ``tf.matmul`` products of ``M`` with itself.

    Args:
        M: The operand; it is passed straight to ``tf.matmul``.
        n: Step count. When ``n < 1`` no multiplication is performed.

    Returns:
        ``M`` itself when ``n < 1``. Otherwise ``M`` left-multiplied onto the
        running product once per unit of ``n``, so for an integer ``n >= 1``
        the result contains ``n + 1`` factors of ``M``.
    """
    product = M
    remaining = n
    # Same stopping rule as the base case "n < 1": keep multiplying until it holds.
    while not remaining < 1:
        product = tf.matmul(M, product)
        remaining -= 1
    return product

# Device function for ops built on behalf of worker task 0.
# NOTE(review): the cluster spec in this notebook declares only a "worker" job.
# replica_device_setter may return None when the cluster has no "ps" job, which
# would make both device scopes below no-ops - confirm against the TF docs.
task0_device_fn = tf.train.replica_device_setter(
    worker_device="/job:worker/task:0", cluster=cluster
)
with tf.device(task0_device_fn):
    A = tf.random_normal(shape=[10000, 10000])
    # With n = 2 and matpow's "n < 1" base case, this chains two tf.matmul calls.
    c1 = matpow(A, n)

# Device function for ops built on behalf of worker task 1.
task1_device_fn = tf.train.replica_device_setter(
    worker_device="/job:worker/task:1", cluster=cluster
)
with tf.device(task1_device_fn):
    B = tf.random_normal(shape=[10000, 10000])
    c2 = matpow(B, n)

# The final add is created outside both device scopes.
# NOTE: binding the name `sum` hides Python's built-in sum() in this notebook's namespace.
sum = c1 + c2

import tensorflow as tf
import datetime

# Run the graph through the server started as worker 0, acting as chief.
with tf.train.MonitoredTrainingSession(worker0.target,
                                       is_chief=True) as sess:
    start_time = datetime.datetime.now()
    result = sess.run(sum)
    # Stop the clock before printing so the reported time covers only
    # sess.run, not the formatting/printing of the result array.
    elapsed = datetime.datetime.now() - start_time
    print(result)
    print("Execution time: " + str(elapsed))
          

[[-33297.84375    -20306.92773438 -27761.7890625  ..., -42657.953125
    8224.44726562   5156.65625   ]
 [    75.38378906   3666.71875     -1740.1105957  ...,  -8287.85546875
   -6599.88183594  -1797.16503906]
 [-22654.90429688 -27462.53125    -15845.95214844 ...,   3172.30493164
  -25671.390625     5785.87597656]
 ..., 
 [-11659.15820312 -12027.22265625 -15839.97851562 ...,   5913.44677734
   -1020.65478516   8333.6796875 ]
 [  -987.72363281  -4550.59375    -19268.74414062 ...,  14845.99609375
   10885.14453125 -20765.82421875]
 [  -144.18981934 -30101.40429688  14654.04296875 ...,   6185.74023438
   -2888.77050781   4643.55126953]]
Execution time: 0:00:03.805252
