# Train Model on Distributed Cluster

## Define Cluster Spec

In [1]:
import tensorflow as tf

cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]})

## Start Server "Task 0" (localhost:2222)

In [2]:
server0 = tf.train.Server(cluster, job_name="local", task_index=0)

print(server0)

<tensorflow.python.training.server_lib.Server object at 0x7fc0639797b8>


## Start Server "Task 1" (localhost:2223)

In [3]:
server1 = tf.train.Server(cluster, job_name="local", task_index=1)

print(server1)

<tensorflow.python.training.server_lib.Server object at 0x7fc063980f28>


## Define a Computationally-intensive TensorFlow Graph

In [13]:
import tensorflow as tf
import datetime
import numpy as np

n = 2
c1 = tf.Variable([])
c2 = tf.Variable([])

def matpow(M, n):
    if n < 1: 
        return M
    else:
        return tf.matmul(M, matpow(M, n-1))

## Assign Devices Manually 

### All CPU Devices
Note the execution time.

In [17]:
with tf.device("/job:local/task:0/cpu:0"):
    A = tf.random_normal(shape=[10000, 10000])
    c1 = matpow(A,n)

with tf.device("/job:local/task:1/cpu:0"):
    B = tf.random_normal(shape=[10000, 10000])
    c2 = matpow(B,n)

with tf.Session("grpc://127.0.0.1:2222") as sess:
    sum = c1 + c2
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    print("Execution time: " 
          + str(datetime.datetime.now() - start_time))
          

[[  8.06669482e+03   1.98840781e+04   2.59456055e+04 ...,  -1.05382949e+04
    2.17728613e+03  -1.08155969e+03]
 [ -1.37961562e+04   2.33915625e+04   1.54497617e+04 ...,   1.04375918e+04
   -1.53179043e+04   1.20189941e+04]
 [  9.67124414e+03   1.12005645e+04   2.62229941e+04 ...,  -7.41832715e+03
   -5.12368652e+02   2.34161426e+04]
 ..., 
 [ -1.59691467e+03   1.64786194e+02   1.12370107e+04 ...,   2.95887378e+03
    4.91890723e+03   6.33641113e+03]
 [ -3.10643281e+04   2.17878662e+03  -1.46538848e+04 ...,   2.06148975e+03
   -1.67480703e+04   3.95390625e+01]
 [ -7.99488135e+03  -8.46742383e+03  -4.03039727e+04 ...,  -1.61571033e+03
   -8.50556738e+03   7.80693604e+02]]
Execution time: 0:00:38.643086
[[ -3.14939355e+03   2.78959609e+04   1.18435742e+03 ...,   2.07224688e+04
   -2.66485156e+03   5.40588574e+03]
 [  6.98263867e+03  -2.49192070e+04  -6.78605029e+03 ...,   7.31186719e+03
   -8.07101904e+03  -1.76662031e+04]
 [  3.75623945e+04   1.08512695e+02   1.21396152e+04 ...,   1.534

### CPU and GPU
Note the execution time.

In [20]:
with tf.device("/job:local/task:0/gpu:0"):
    A = tf.random_normal(shape=[10000, 10000])
    c1 = matpow(A,n)

with tf.device("/job:local/task:1/cpu:0"):
    B = tf.random_normal(shape=[10000, 10000])
    c2 = matpow(B,n)

with tf.Session("grpc://127.0.0.1:2222") as sess:
    sum = c1 + c2
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    print("Execution time: " 
          + str(datetime.datetime.now() - start_time))

[[-11153.16210938  -9110.88671875 -18038.21289062 ...,  -6553.86132812
    9058.89746094   -503.43164062]
 [ -6129.02929688  25932.4375       5949.13232422 ...,    976.45605469
   -2700.015625    19676.55078125]
 [ 13264.51953125  20145.62695312  14674.35253906 ...,  -1021.64990234
     291.12854004 -14559.21289062]
 ..., 
 [-17108.984375      299.73730469 -19944.95507812 ...,  27386.06835938
    5067.86376953 -10734.08984375]
 [   482.88574219 -20873.68164062 -14998.39648438 ...,  -3522.890625
  -15539.8125     -24584.39257812]
 [ -9291.37695312   8291.29394531  -8054.63476562 ...,   5531.68554688
  -20168.13085938   5089.35205078]]
Execution time: 0:00:21.550157


### All GPU Devices
Note the execution time.

In [18]:
with tf.device("/job:local/task:0/gpu:0"):
    A = tf.random_normal(shape=[10000, 10000])
    c1 = matpow(A,n)

with tf.device("/job:local/task:1/gpu:0"):
    B = tf.random_normal(shape=[10000, 10000])
    c2 = matpow(B,n)

with tf.Session("grpc://127.0.0.1:2222") as sess:
    sum = c1 + c2
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    print("Execution time: " 
          + str(datetime.datetime.now() - start_time))

[[ 317566.59375    -600115.875       572440.875      ..., -767323.0625
  -732039.          -29518.16796875]
 [ -71233.4375     -931581.9375      122241.8359375  ...,  224599.515625
  -890797.5         553153.9375    ]
 [ 131152.078125   -680631.5625      -64531.26171875 ...,  -32352.26171875
   667255.75       -509656.5625    ]
 ..., 
 [ -13648.36816406   13166.90136719   10610.7265625  ...,   21194.78320312
   -10224.09082031   12267.98535156]
 [  -2338.078125     14536.25292969   -7216.79052734 ...,  -14411.16113281
    11336.85839844  -48624.421875  ]
 [ -15214.82519531    3822.67333984  -12475.60449219 ...,   -9641.03613281
    -5436.52050781   12227.578125  ]]
Execution time: 0:00:09.848365


### Auto-assign Device by TensorFlow (Round-Robin by Default)
Note the execution time.

In [10]:
# Let TensorFlow decide the device placement
with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:0",
                                              cluster=cluster)):
    A = tf.random_normal(shape=[10000, 10000])
    c1 = matpow(A,n)

with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:1",
                                              cluster=cluster)):
    B = tf.random_normal(shape=[10000, 10000])
    c2 = matpow(B,n)

with tf.Session("grpc://127.0.0.1:2222") as sess:
    sum = c1 + c2
    start_time = datetime.datetime.now()
    print(sess.run(sum))
    print("Multi node computation time: " 
          + str(datetime.datetime.now() - start_time))

[[  3825.11035156  16507.63671875   1743.14160156 ..., -15750.22558594
   -6879.64208984  18151.4140625 ]
 [ 18579.79296875  10003.78125       192.5947876  ...,  -8365.53808594
    2745.18457031 -22178.72265625]
 [-17603.28710938  -1997.83166504 -15371.70703125 ...,  -7219.47558594
   -3515.5949707    1374.12060547]
 ..., 
 [ 12190.2578125   10308.515625    19632.62304688 ...,   5232.08691406
  -18584.37890625  14044.14257812]
 [ 32060.06640625 -14041.76269531  25165.88671875 ..., -25889.4375
  -13885.19238281    241.92675781]
 [ 16637.23828125 -25647.38476562  -4619.76611328 ...,   -319.69140625
   10786.5546875   19262.70117188]]
Multi node computation time: 0:00:06.779253
