This notebook is an example of multi-GPU training with TensorFlow.
I used the CIFAR-10 model. To improve performance, you can increase the image size, switch to an Inception or VGG model, or apply image augmentation. <br></br>

The original code is available [here][1].

[1]: https://github.com/petewarden/tensorflow_makefile/blob/master/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py

In [6]:
import cv2
import numpy as np
from tqdm import tqdm
import pandas as pd
import os
from datetime import datetime
import tensorflow as tf
import time
import matplotlib.pyplot as plt
from scipy.misc import imread, imresize

In [None]:
# Load the training labels; later cells read the 'id' and 'breed' columns.
df = pd.read_csv('labels.csv')
# Preview the first rows (shown as the cell output in a notebook).
df.head()

In [None]:
# Number of training samples.
n = len(df)
# Sort the unique breed names so that every run assigns the same index to the
# same breed; the iteration order of a bare set of strings can change between
# interpreter runs, which would silently permute the class indices.
breed = sorted(set(df['breed']))
n_class = len(breed)
# Lookup tables: breed name -> class index, and class index -> breed name.
class_to_num = {name: idx for idx, name in enumerate(breed)}
num_to_class = dict(enumerate(breed))

In [None]:
# Side length (in pixels) every image is resized to.
IMAGE_SIZE = 32
# X holds the resized training images, y the one-hot encoded breed labels.
X = np.zeros((n, IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.uint8)
y = np.zeros((n, n_class), dtype=np.uint8)
# Iterate the two columns positionally so the loop does not depend on the
# DataFrame index labels being 0..n-1.
for i, (image_id, breed_name) in enumerate(tqdm(zip(df['id'], df['breed']), total=n)):
    path = 'train/%s.jpg' % image_id
    image = cv2.imread(path)
    # cv2.imread signals a missing/unreadable file by returning None; fail
    # with a clear message instead of an obscure error inside cv2.resize.
    if image is None:
        raise IOError('Could not read training image: %s' % path)
    X[i] = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE))
    y[i, class_to_num[breed_name]] = 1


In [None]:
# The sample submission lists the ids of all test images.
df_test = pd.read_csv('sample_submission.csv')
NUMBER_TEST = len(df_test)
X_test = np.zeros((NUMBER_TEST, IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.uint8)
for i, image_id in enumerate(tqdm(df_test['id'], total=NUMBER_TEST)):
    path = '../input/test/%s.jpg' % image_id
    # Use the same OpenCV loader as for the training images so that both sets
    # share one channel order (OpenCV yields BGR) and one resize routine.
    image = cv2.imread(path)
    # cv2.imread returns None when the file is missing or unreadable.
    if image is None:
        raise IOError('Could not read test image: %s' % path)
    X_test[i] = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE))

### Image augmentation
No real augmentation is applied here; I only rescaled the pixel values to the [0, 1] range.

In [None]:
# Convert the pixel arrays to float32 and rescale them by 1/255.
X = X.astype(np.float32) / 255.
X_test = X_test.astype(np.float32) / 255.

### Building the CIFAR-10 model
Variables are created with `tf.get_variable()` so that they can be shared across multiple GPUs.

In [None]:
def _variable_on_cpu(name, shape, initializer):
    """Return a float32 variable obtained via tf.get_variable under '/cpu:0'.

    Args:
        name: Name of the variable within the current variable scope.
        shape: Shape of the variable.
        initializer: Initializer passed to tf.get_variable.

    Returns:
        The variable returned by tf.get_variable.
    """
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape, initializer=initializer,
                               dtype=tf.float32)

def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal initialized variable with optional L2 decay.

    Args:
        name: Name of the variable.
        shape: Shape of the variable.
        stddev: Standard deviation of the truncated normal initializer.
        wd: Weight-decay factor. When it is not None, l2_loss(var) * wd is
            added to the 'losses' collection (this includes wd == 0.0).

    Returns:
        The created variable.
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev,
                                                  dtype=tf.float32)
    variable = _variable_on_cpu(name, shape, initializer)
    if wd is None:
        return variable
    l2_penalty = tf.multiply(tf.nn.l2_loss(variable), wd, name='weight_loss')
    tf.add_to_collection('losses', l2_penalty)
    return variable


In [None]:
def inference(images, num_classes=120):
    """Build the CIFAR-10 style CNN and return the unnormalized class logits.

    Layers, in order: conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    fc1 (384 units) -> fc2 (192 units) -> fc3 (num_classes units).

    Args:
        images: 4-D float input laid out as [batch, height, width, 3]. Height
            and width must be statically known so the flattened feature size
            can be computed.
        num_classes: Number of output classes. Defaults to 120, the value
            that was previously hard-coded.

    Returns:
        A 2-D tensor of shape [batch, num_classes] with the logits (softmax is
        not applied here).

    Raises:
        ValueError: If the spatial size of the last pooling layer is not
            statically known.
    """
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 3, 64], stddev=5e-2, wd=0.0)
        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)

    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 64, 64], stddev=5e-2, wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)

    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2')
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1], padding='SAME', name='pool2')

    with tf.variable_scope('fc1') as scope:
        # Derive the flattened feature size from pool2's static shape instead
        # of hard-coding 8*8*64, which is only correct for 32x32 inputs.
        feature_shape = pool2.get_shape().as_list()[1:]
        if None in feature_shape:
            raise ValueError('inference() needs a statically known image size, '
                             'got pool2 shape %s' % pool2.get_shape())
        height, width, channels = feature_shape
        dim = height * width * channels
        reshape = tf.reshape(pool2, [-1, dim])
        weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)

    with tf.variable_scope('fc2') as scope:
        weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)

    with tf.variable_scope('fc3') as scope:
        # Final linear layer: one logit per class, no activation.
        weights = _variable_with_weight_decay('weights', [192, num_classes],
                                              stddev=1/192.0, wd=0.0)
        biases = _variable_on_cpu('biases', [num_classes],
                                  tf.constant_initializer(0.0))
        logits = tf.add(tf.matmul(local4, weights), biases, name=scope.name)

    return logits


### Multi-gpu training
* Set parameters: to use multiple GPUs, change the value of `NUMBER_GPU`. <br></br>

This is only an example, so I trained for just 50 steps.

In [None]:
# --- Hardware / run length ---
NUMBER_GPU = 1                      # number of GPU towers to build
MAX_STEP = 50                       # number of training steps to run
BATCH_SIZE = 32                     # examples per tower per step
TOWER_NAME = 'tower'                # prefix of each tower's name scope

# --- Data dimensions ---
NUMBER_TRAIN = int(X.shape[0])      # number of training examples
IMAGE_SIZE = 32
CHANNEL = 3
NUMBER_CLASS = 120

# --- Learning-rate schedule and moving average ---
INITIAL_LEARNING_RATE = 0.01
NUM_EPOCHS_PER_DECAY = 30.0         # epochs between learning-rate decays
LEARNING_RATE_DECAY_FACTOR = 0.16
MOVING_AVERAGE_DECAY = 0.9          # decay of the variable moving averages

# --- RMSProp optimizer settings ---
RMSPROP_DECAY = 0.9
RMSPROP_MOMENTUM = 0.9
RMSPROP_EPSILON = 1.0

* Sum the losses assigned to each GPU.

In [None]:
def tower_loss(scope, image, label):
    """Build the total loss of a single tower.

    Args:
        scope: Name-scope prefix of the tower; used to select only this
            tower's entries from the 'losses' collection.
        image: Batch of input images, passed to inference().
        label: Batch of one-hot encoded labels, one row per image.

    Returns:
        A scalar tensor: the mean softmax cross entropy of the batch plus all
        other 'losses' collection entries registered under `scope`.
    """
    logits = inference(image)
    # The labels are one-hot rows, i.e. probability distributions, so they are
    # cast to the floating-point dtype of the logits rather than to int64.
    labels = tf.cast(label, logits.dtype)

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='cross_entropy_per_example')
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
    tf.add_to_collection('losses', cross_entropy_mean)

    # Sum only the losses created inside this tower's name scope.
    tower_losses = tf.get_collection('losses', scope)
    return tf.add_n(tower_losses, name='total_loss')


* Average the gradients across all GPUs.

In [None]:
def average_gradients(tower_grads):
    """Average the gradient of each variable over all towers.

    Args:
        tower_grads: List with one entry per tower; each entry is a list of
            (gradient, variable) pairs, in the same variable order for every
            tower.

    Returns:
        A list of (averaged gradient, variable) pairs, one per variable.
    """
    averaged = []
    # zip(*tower_grads) regroups the per-tower lists so that each item holds
    # every tower's (gradient, variable) pair for one variable.
    for per_variable in zip(*tower_grads):
        # Give each gradient a leading 'tower' axis and join along it.
        stacked = tf.concat(
            axis=0,
            values=[tf.expand_dims(g, 0) for g, _ in per_variable])
        # Average over the 'tower' axis.
        mean_grad = tf.reduce_mean(stacked, 0)
        # The variable is taken from the first tower's pair.
        shared_var = per_variable[0][1]
        averaged.append((mean_grad, shared_var))
    return averaged

* Training

In [None]:
# Build the multi-tower training graph, run MAX_STEP training steps, then
# predict class probabilities for the whole test set into `predict`.
train_image, train_label = X, y

with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Step counter; starts at 0 and is incremented by apply_gradients.
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    num_batches_per_epoch = (NUMBER_TRAIN / BATCH_SIZE)
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Learning rate drops by LEARNING_RATE_DECAY_FACTOR every decay_steps steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    optimizer = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY,
                                          momentum=RMSPROP_MOMENTUM,
                                          epsilon=RMSPROP_EPSILON)

    # Sample the batch indices inside the graph so that every enqueue draws a
    # fresh random batch. Sampling once with NumPy here would freeze a single
    # batch into the graph and train on the same examples at every step.
    # NOTE: the full training set is stored as graph constants, which only
    # works while it stays below the 2 GB graph size limit.
    all_images = tf.constant(train_image)
    all_labels = tf.constant(train_label)
    randidx = tf.random_uniform([BATCH_SIZE], minval=0, maxval=NUMBER_TRAIN,
                                dtype=tf.int32)
    batch_xs = tf.gather(all_images, randidx)
    batch_ys = tf.gather(all_labels, randidx)

    batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue([batch_xs, batch_ys], capacity=2 * NUMBER_GPU)

    # One tower per GPU; all towers share the same variables.
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(NUMBER_GPU):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    image_batch, label_batch = batch_queue.dequeue()
                    loss = tower_loss(scope, image_batch, label_batch)
                    # Towers built after this one reuse the variables.
                    tf.get_variable_scope().reuse_variables()

                    grads = optimizer.compute_gradients(loss)
                    tower_grads.append(grads)
    grads = average_gradients(tower_grads)

    apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    train_op = tf.group(apply_gradient_op, variables_averages_op)

    init = tf.global_variables_initializer()
    # Soft placement lets ops without a GPU kernel (or a missing GPU) fall
    # back to the CPU.
    sess = tf.Session(config=tf.ConfigProto(
                      allow_soft_placement=True,
                      log_device_placement=False))

    # The coordinator allows the queue-runner threads to be stopped cleanly.
    coord = tf.train.Coordinator()
    threads = []
    try:
        sess.run(init)
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        print('Learning start.')
        for step in range(MAX_STEP):
            loss_time = time.time()
            # `loss` is the loss of the last tower that was built.
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - loss_time

            if step % 10 == 0:
                num_examples_per_step = BATCH_SIZE * NUMBER_GPU
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / NUMBER_GPU

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ''sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
        print('Learning finish.')

        # Reuse the trained variables to predict the whole test set at once.
        tf.get_variable_scope().reuse_variables()
        logits = inference(X_test)
        softmax_logits = tf.nn.softmax(logits)
        predict = sess.run(softmax_logits)
    finally:
        # Stop the prefetch threads and release the session's resources even
        # when training or prediction fails.
        coord.request_stop()
        coord.join(threads)
        sess.close()

### Submit file

In [None]:
# Copy each breed's predicted probability column into the submission frame.
for breed_name in breed:
    class_idx = class_to_num[breed_name]
    df_test[breed_name] = predict[:, class_idx]

In [None]:
# Write the submission without the row index; `index` is a boolean flag, so
# pass False explicitly instead of relying on None being falsy.
df_test.to_csv('predict.csv', index=False)