diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..7684eed1
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,13 @@
+repos:
+- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
+  sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
+  hooks:
+  - id: yapf
+    files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
+  hooks:
+  - id: check-added-large-files
+  - id: check-merge-conflict
+  - id: check-symlinks
+  - id: end-of-file-fixer
diff --git a/README.md b/README.md
index 63388821..7fb29933 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,9 @@
 
 ## Howtos
 
+### Contribute
+- Run `pre-commit run -a` before submitting your PR; it will format the code automatically
+
 ### Add New Evaluation Task
 
 Reference [mnist task](https://github.com/Superjomn/paddle-ce-latest-kpis/tree/master/mnist),
diff --git a/__ocr_recognition/continuous_evaluation.py b/__ocr_recognition/continuous_evaluation.py
new file mode 100644
index 00000000..a4da1f67
--- /dev/null
+++ b/__ocr_recognition/continuous_evaluation.py
@@ -0,0 +1,12 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+train_avg_loss_kpi = CostKpi('train_avg_loss', 0.2, 0)
+train_seq_err_kpi = CostKpi('train_seq_err', 0.2, 0)
+
+tracking_kpis = [
+    train_avg_loss_kpi,
+    train_seq_err_kpi,
+]
diff --git a/__ocr_recognition/crnn_ctc_model.py b/__ocr_recognition/crnn_ctc_model.py
new file mode 100644
index 00000000..df33100e
--- /dev/null
+++ b/__ocr_recognition/crnn_ctc_model.py
@@ -0,0 +1,221 @@
+import paddle.fluid as fluid
+
+
+def conv_bn_pool(input,
+                 group,
+                 out_ch,
+                 act="relu",
+                 param=None,
+                 bias=None,
+                 param_0=None,
+                 is_test=False):
+    tmp = input
+    for i in xrange(group):
+        tmp = fluid.layers.conv2d(
+            input=tmp,
+            num_filters=out_ch[i],
+            filter_size=3,
+            padding=1,
+            param_attr=param if param_0 is None else param_0,
+            act=None,  # LinearActivation
+            use_cudnn=True)
+        tmp = fluid.layers.batch_norm(
+            input=tmp,
+            act=act,
+            param_attr=param,
+            bias_attr=bias,
+            is_test=is_test)
+    tmp = fluid.layers.pool2d(
+        input=tmp,
+        pool_size=2,
+        pool_type='max',
+        pool_stride=2,
+        use_cudnn=True,
+        ceil_mode=True)
+
+    return tmp
+
+
+def ocr_convs(input,
+              num,
+              with_bn,
+              regularizer=None,
+              gradient_clip=None,
+              is_test=False):
+    assert (num % 4 == 0)
+
+    b = fluid.ParamAttr(
+        regularizer=regularizer,
+        gradient_clip=gradient_clip,
+        initializer=fluid.initializer.Normal(0.0, 0.0))
+    w0 = fluid.ParamAttr(
+        regularizer=regularizer,
+        gradient_clip=gradient_clip,
+        initializer=fluid.initializer.Normal(0.0, 0.0005))
+    w1 = fluid.ParamAttr(
+        regularizer=regularizer,
+        gradient_clip=gradient_clip,
+        initializer=fluid.initializer.Normal(0.0, 0.01))
+    tmp = input
+    tmp = conv_bn_pool(
+        tmp, 2, [16, 16], param=w1, bias=b, param_0=w0, is_test=is_test)
+
+    tmp = conv_bn_pool(tmp, 2, [32, 32], param=w1, bias=b, is_test=is_test)
+    tmp = conv_bn_pool(tmp, 2, [64, 64], param=w1, bias=b, is_test=is_test)
+    tmp = conv_bn_pool(tmp, 2, [128, 128], param=w1, bias=b, is_test=is_test)
+    return tmp
+
+
+def encoder_net(images,
+                num_classes,
+                rnn_hidden_size=200,
+                regularizer=None,
+                gradient_clip=None,
+                is_test=False):
+    conv_features = ocr_convs(
+        images,
+        8,
+        True,
+        regularizer=regularizer,
+        gradient_clip=gradient_clip,
+        is_test=is_test)
+    sliced_feature = fluid.layers.im2sequence(
+        input=conv_features,
+        stride=[1, 1],
+        filter_size=[conv_features.shape[2],
1]) + + para_attr = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.02)) + bias_attr = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.02), + learning_rate=2.0) + bias_attr_nobias = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.02)) + + fc_1 = fluid.layers.fc(input=sliced_feature, + size=rnn_hidden_size * 3, + param_attr=para_attr, + bias_attr=bias_attr_nobias) + fc_2 = fluid.layers.fc(input=sliced_feature, + size=rnn_hidden_size * 3, + param_attr=para_attr, + bias_attr=bias_attr_nobias) + + gru_forward = fluid.layers.dynamic_gru( + input=fc_1, + size=rnn_hidden_size, + param_attr=para_attr, + bias_attr=bias_attr, + candidate_activation='relu') + gru_backward = fluid.layers.dynamic_gru( + input=fc_2, + size=rnn_hidden_size, + is_reverse=True, + param_attr=para_attr, + bias_attr=bias_attr, + candidate_activation='relu') + + w_attr = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.02)) + b_attr = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.0)) + + fc_out = fluid.layers.fc(input=[gru_forward, gru_backward], + size=num_classes + 1, + param_attr=w_attr, + bias_attr=b_attr) + + return fc_out + + +def ctc_train_net(images, label, args, num_classes): + regularizer = fluid.regularizer.L2Decay(args.l2) + gradient_clip = None + if args.parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places, use_nccl=True) + with pd.do(): + images_ = pd.read_input(images) + label_ = pd.read_input(label) + + fc_out = encoder_net( + images_, + num_classes, + regularizer=regularizer, + gradient_clip=gradient_clip) + + cost = fluid.layers.warpctc( + input=fc_out, + label=label_, + blank=num_classes, + norm_by_times=True) + sum_cost = fluid.layers.reduce_sum(cost) + + decoded_out = fluid.layers.ctc_greedy_decoder( + input=fc_out, blank=num_classes) + + pd.write_output(sum_cost) + pd.write_output(decoded_out) + + sum_cost, decoded_out = pd() + sum_cost = fluid.layers.reduce_sum(sum_cost) + + else: + fc_out = encoder_net( + images, + num_classes, + regularizer=regularizer, + gradient_clip=gradient_clip) + + cost = fluid.layers.warpctc( + input=fc_out, label=label, blank=num_classes, norm_by_times=True) + sum_cost = fluid.layers.reduce_sum(cost) + decoded_out = fluid.layers.ctc_greedy_decoder( + input=fc_out, blank=num_classes) + + casted_label = fluid.layers.cast(x=label, dtype='int64') + error_evaluator = fluid.evaluator.EditDistance( + input=decoded_out, label=casted_label) + + inference_program = fluid.default_main_program().clone(for_test=True) + + optimizer = fluid.optimizer.Momentum( + learning_rate=args.learning_rate, momentum=args.momentum) + _, params_grads = optimizer.minimize(sum_cost) + model_average = fluid.optimizer.ModelAverage( + args.average_window, + params_grads, + min_average_window=args.min_average_window, + max_average_window=args.max_average_window) + + return sum_cost, error_evaluator, inference_program, model_average + + +def ctc_infer(images, num_classes): + fc_out = encoder_net(images, num_classes, is_test=True) + return fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes) + + +def ctc_eval(images, label, num_classes): + fc_out = encoder_net(images, num_classes, is_test=True) + decoded_out = 
fluid.layers.ctc_greedy_decoder(
+        input=fc_out, blank=num_classes)
+
+    casted_label = fluid.layers.cast(x=label, dtype='int64')
+    error_evaluator = fluid.evaluator.EditDistance(
+        input=decoded_out, label=casted_label)
+
+    cost = fluid.layers.warpctc(
+        input=fc_out, label=label, blank=num_classes, norm_by_times=True)
+
+    return error_evaluator, cost
diff --git a/__ocr_recognition/ctc_reader.py b/__ocr_recognition/ctc_reader.py
new file mode 100644
index 00000000..5e65ef42
--- /dev/null
+++ b/__ocr_recognition/ctc_reader.py
@@ -0,0 +1,201 @@
+import os
+import cv2
+import tarfile
+import numpy as np
+from PIL import Image
+from os import path
+import paddle as paddle
+from paddle.utils.image_util import load_image
+
+NUM_CLASSES = 10784
+DATA_SHAPE = [1, 48, 512]
+
+DATA_MD5 = "1de60d54d19632022144e4e58c2637b5"
+DATA_URL = "http://cloud.dlnel.org/filepub/?uuid=df937251-3c0b-480d-9a7b-0080dfeee65c"
+CACHE_DIR_NAME = "ctc_data"
+SAVED_FILE_NAME = "data.tar.gz"
+DATA_DIR_NAME = "data"
+TRAIN_DATA_DIR_NAME = "train_images"
+TEST_DATA_DIR_NAME = "test_images"
+TRAIN_LIST_FILE_NAME = "train.list"
+TEST_LIST_FILE_NAME = "test.list"
+
+
+class DataGenerator(object):
+    def __init__(self):
+        pass
+
+    def train_reader(self, img_root_dir, img_label_list, batchsize):
+        '''
+        Reader interface for training.
+
+        :param img_root_dir: The root path of the training images.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for training.
+        :type img_label_list: str
+
+        '''
+
+        img_label_lines = []
+        if batchsize == 1:
+            to_file = "tmp.txt"
+            cmd = "cat " + img_label_list + " | awk '{print $1,$2,$3,$4;}' | shuf > " + to_file
+            print "cmd: " + cmd
+            os.system(cmd)
+            print "finish batch shuffle"
+            img_label_lines = open(to_file, 'r').readlines()
+        else:
+            to_file = "tmp.txt"
+            # cmd1: partial shuffle
+            cmd = "cat " + img_label_list + " | awk '{printf(\"%04d%.4f %s\\n\", $1, rand(), $0)}' | sort | sed 1,$((1 + RANDOM % 100))d | "
+            # cmd2: batch merge and shuffle
+            cmd += "awk '{printf $2\" \"$3\" \"$4\" \"$5\" \"; if(NR % " + str(
+                batchsize) + " == 0) print \"\";}' | shuf | "
+            # cmd3: batch split
+            cmd += "awk '{if(NF == " + str(
+                batchsize
+            ) + " * 4) {for(i = 0; i < " + str(
+                batchsize
+            ) + "; i++) print $(4*i+1)\" \"$(4*i+2)\" \"$(4*i+3)\" \"$(4*i+4);}}' > " + to_file
+            print "cmd: " + cmd
+            os.system(cmd)
+            print "finish batch shuffle"
+            img_label_lines = open(to_file, 'r').readlines()
+
+        def reader():
+            sizes = len(img_label_lines) / batchsize
+            for i in range(sizes):
+                result = []
+                sz = [0, 0]
+                for j in range(batchsize):
+                    line = img_label_lines[i * batchsize + j]
+                    # h, w, img_name, labels
+                    items = line.split(' ')
+
+                    label = [int(c) for c in items[-1].split(',')]
+                    img = Image.open(os.path.join(img_root_dir, items[
+                        2])).convert('L')  # convert to grayscale
+                    if j == 0:
+                        sz = img.size
+                    img = img.resize((sz[0], sz[1]))
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    result.append([img, label])
+                yield result
+
+        return reader
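+
+    # A minimal usage sketch (the paths here are hypothetical; shapes
+    # follow the code above):
+    #
+    #     reader = DataGenerator().train_reader('train_images',
+    #                                           'train.list', 32)
+    #     for batch in reader():
+    #         pass  # batch: list of 32 [img, label] pairs; img is a
+    #               # [1, H, W] float array, label a list of char ids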
+    def test_reader(self, img_root_dir, img_label_list):
+        '''
+        Reader interface for evaluation.
+
+        :param img_root_dir: The root path of the test images.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for testing.
+        :type img_label_list: str
+        '''
+
+        def reader():
+            for line in open(img_label_list):
+                # h, w, img_name, labels
+                items = line.split(' ')
+
+                label = [int(c) for c in items[-1].split(',')]
+                img = Image.open(os.path.join(img_root_dir, items[2])).convert(
+                    'L')
+                img = np.array(img) - 127.5
+                img = img[np.newaxis, ...]
+                yield img, label
+
+        return reader
+
+    def infer_reader(self, img_root_dir=None, img_label_list=None):
+        '''A reader interface for inference.
+
+        :param img_root_dir: The root path of the images for inference.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the image-list file. If
+            img_root_dir is None, each line must be a full image path;
+            if img_label_list is also None, image paths are read from stdin.
+        :type img_label_list: str
+        '''
+
+        def reader():
+            if img_label_list is not None:
+                for line in open(img_label_list):
+                    if img_root_dir is not None:
+                        # h, w, img_name, labels
+                        img_name = line.split(' ')[2]
+                        img_path = os.path.join(img_root_dir, img_name)
+                    else:
+                        img_path = line.strip("\t\n\r")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    # No ground truth is available at inference time, so
+                    # yield a dummy label, as in the stdin branch below.
+                    yield img, [[0]]
+            else:
+                while True:
+                    img_path = raw_input("Please input the path of image: ")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    yield img, [[0]]
+
+        return reader
+
+
+def num_classes():
+    '''Get classes number of this dataset.
+    '''
+    return NUM_CLASSES
+
+
+def data_shape():
+    '''Get image shape of this dataset. It is a dummy shape for this dataset.
+    '''
+    return DATA_SHAPE
+
+
+def train(batch_size, train_images_dir=None, train_list_file=None):
+    generator = DataGenerator()
+    if train_images_dir is None:
+        data_dir = download_data()
+        train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME)
+    if train_list_file is None:
+        train_list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME)
+    return generator.train_reader(train_images_dir, train_list_file,
+                                  batch_size)
+
+
+def test(batch_size=1, test_images_dir=None, test_list_file=None):
+    generator = DataGenerator()
+    if test_images_dir is None:
+        data_dir = download_data()
+        test_images_dir = path.join(data_dir, TEST_DATA_DIR_NAME)
+    if test_list_file is None:
+        test_list_file = path.join(data_dir, TEST_LIST_FILE_NAME)
+    return paddle.batch(
+        generator.test_reader(test_images_dir, test_list_file), batch_size)
+
+
+def inference(infer_images_dir=None, infer_list_file=None):
+    generator = DataGenerator()
+    return paddle.batch(
+        generator.infer_reader(infer_images_dir, infer_list_file), 1)
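+
+
+# NOTE: download_data() fetches the archive through paddle.dataset.common,
+# which caches it (by default under ~/.cache/paddle/dataset/ctc_data) and
+# verifies DATA_MD5, so repeated runs skip the download.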
+def download_data():
+    '''Download train and test data.
+    '''
+    tar_file = paddle.dataset.common.download(
+        DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
+    data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
+    if not path.isdir(data_dir):
+        t = tarfile.open(tar_file, "r:gz")
+        t.extractall(path=path.dirname(tar_file))
+        t.close()
+    return data_dir
diff --git a/__ocr_recognition/ctc_train.py b/__ocr_recognition/ctc_train.py
new file mode 100644
index 00000000..43fcd13e
--- /dev/null
+++ b/__ocr_recognition/ctc_train.py
@@ -0,0 +1,138 @@
+"""Trainer for OCR CTC model."""
+import paddle.fluid as fluid
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
+from crnn_ctc_model import ctc_train_net
+import ctc_reader
+import argparse
+import functools
+import sys
+import time
+import os
+import numpy as np
+from continuous_evaluation import train_avg_loss_kpi, train_seq_err_kpi
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('pass_num', int, 100, "Number of training epochs.")
+add_arg('log_period', int, 100, "Log period.")
+add_arg('iterations', int, 10000, "The total number of iterations.")
+add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.")
+add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.")
+add_arg('save_model_dir', str, "./models", "The directory the model is saved to.")
+add_arg('init_model', str, None, "The init model file or directory.")
+add_arg('learning_rate', float, 1.0e-3, "Learning rate.")
+add_arg('l2', float, 0.0004, "L2 regularizer.")
+add_arg('momentum', float, 0.9, "Momentum.")
+add_arg('rnn_hidden_size', int, 200, "Hidden size of rnn layers.")
+add_arg('use_gpu', bool, True, "Whether to use GPU to train.")
+add_arg('min_average_window', int, 10000, "Min average window.")
+add_arg('max_average_window', int, 15625, "Max average window. It is proposed to be set as the number of minibatches in a pass.")
+add_arg('average_window', float, 0.15, "Average window.")
+add_arg('parallel', bool, False, "Whether to use parallel training.")
+add_arg('train_images', str, None, "The directory of training images."
+        "None means using the default training images of reader.")
+add_arg('train_list', str, None, "The list file of training images."
+        "None means using the default train_list file of reader.")
+add_arg('test_images', str, None, "The directory of testing images."
+        "None means using the default test images of reader.")
+add_arg('test_list', str, None, "The list file of testing images."
+        "None means using the default test_list file of reader.")
+add_arg('num_classes', int, None, "The number of classes."
+ "None means using the default num_classes from reader.") +# yapf: enable + + +def train(args, data_reader=ctc_reader): + """OCR CTC training""" + num_classes = data_reader.num_classes( + ) if args.num_classes is None else args.num_classes + data_shape = data_reader.data_shape() + # define network + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int32', lod_level=1) + sum_cost, error_evaluator, inference_program, model_average = ctc_train_net( + images, label, args, num_classes) + + # data reader + train_reader = data_reader.train( + args.batch_size, + train_images_dir=args.train_images, + train_list_file=args.train_list) + test_reader = data_reader.test( + test_images_dir=args.test_images, test_list_file=args.test_list) + + # prepare environment + place = fluid.CPUPlace() + if args.use_gpu: + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + # load init model + if args.init_model is not None: + model_dir = args.init_model + model_file_name = None + if not os.path.isdir(args.init_model): + model_dir = os.path.dirname(args.init_model) + model_file_name = os.path.basename(args.init_model) + fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name) + print "Init model from: %s." % args.init_model + + for pass_id in range(args.pass_num): + error_evaluator.reset(exe) + batch_id = 1 + total_loss = 0.0 + total_seq_error = 0.0 + # train a pass + for data in train_reader(): + batch_loss, _, batch_seq_error = exe.run( + fluid.default_main_program(), + feed=get_feeder_data(data, place), + fetch_list=[sum_cost] + error_evaluator.metrics) + total_loss += batch_loss[0] + total_seq_error += batch_seq_error[0] + # training log + if batch_id % args.log_period == 0: + print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." 
% (
+                    time.time(), pass_id, batch_id,
+                    total_loss / (batch_id * args.batch_size),
+                    total_seq_error / (batch_id * args.batch_size))
+                sys.stdout.flush()
+            if batch_id == args.iterations - 1:
+                avg_seq_err = batch_seq_error[0] / args.batch_size
+                avg_loss = batch_loss[0] / args.batch_size
+                train_avg_loss_kpi.add_record(
+                    np.array(
+                        avg_loss, dtype='float32'))
+                train_seq_err_kpi.add_record(
+                    np.array(
+                        avg_seq_err, dtype='float32'))
+                break
+            # evaluate
+            if batch_id % args.eval_period == 0:
+                with model_average.apply(exe):
+                    error_evaluator.reset(exe)
+                    for data in test_reader():
+                        exe.run(inference_program,
+                                feed=get_feeder_data(data, place))
+                    _, test_seq_error = error_evaluator.eval(exe)
+
+                    print "\nTime: %s; Pass[%d]-batch[%d]; Test seq error: %s.\n" % (
+                        time.time(), pass_id, batch_id, str(test_seq_error[0]))
+
+            batch_id += 1
+        train_avg_loss_kpi.persist()
+        train_seq_err_kpi.persist()
+
+
+def main():
+    args = parser.parse_args()
+    print_arguments(args)
+    train(args, data_reader=ctc_reader)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/__ocr_recognition/eval.py b/__ocr_recognition/eval.py
new file mode 100644
index 00000000..1c33ff36
--- /dev/null
+++ b/__ocr_recognition/eval.py
@@ -0,0 +1,71 @@
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
+from crnn_ctc_model import ctc_infer
+from crnn_ctc_model import ctc_eval
+import ctc_reader
+import argparse
+import functools
+import os
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('model_path', str, None, "The model path to be used for evaluation.")
+add_arg('input_images_dir', str, None, "The directory of images.")
+add_arg('input_images_list', str, None, "The list file of images.")
+add_arg('use_gpu', bool, True, "Whether to use GPU for evaluation.")
+# yapf: enable
+
+
+def evaluate(args, eval=ctc_eval, data_reader=ctc_reader):
+    """OCR model evaluation"""
+    num_classes = data_reader.num_classes()
+    data_shape = data_reader.data_shape()
+    # define network
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(
+        name='label', shape=[1], dtype='int32', lod_level=1)
+    evaluator, cost = eval(images, label, num_classes)
+
+    # data reader
+    test_reader = data_reader.test(
+        test_images_dir=args.input_images_dir,
+        test_list_file=args.input_images_list)
+
+    # prepare environment
+    place = fluid.CPUPlace()
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    # load init model
+    model_dir = args.model_path
+    model_file_name = None
+    if not os.path.isdir(args.model_path):
+        model_dir = os.path.dirname(args.model_path)
+        model_file_name = os.path.basename(args.model_path)
+    fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
+    print "Init model from: %s." % args.model_path
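+
+    # evaluator accumulates edit distances over every batch fed below;
+    # eval(exe) then returns (avg_distance, avg_seq_error) over all
+    # samples read so far.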
+    evaluator.reset(exe)
+    count = 0
+    for data in test_reader():
+        count += 1
+        exe.run(fluid.default_main_program(),
+                feed=get_feeder_data(data, place))
+    avg_distance, avg_seq_error = evaluator.eval(exe)
+    print "Read %d samples; avg_distance: %s; avg_seq_error: %s" % (
+        count, avg_distance, avg_seq_error)
+
+
+def main():
+    args = parser.parse_args()
+    print_arguments(args)
+    evaluate(args, data_reader=ctc_reader)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/__ocr_recognition/images/demo.jpg b/__ocr_recognition/images/demo.jpg
new file mode 100644
index 00000000..be5aee50
Binary files /dev/null and b/__ocr_recognition/images/demo.jpg differ
diff --git a/__ocr_recognition/images/train.jpg b/__ocr_recognition/images/train.jpg
new file mode 100644
index 00000000..3d691f1c
Binary files /dev/null and b/__ocr_recognition/images/train.jpg differ
diff --git a/__ocr_recognition/inference.py b/__ocr_recognition/inference.py
new file mode 100644
index 00000000..04175bb1
--- /dev/null
+++ b/__ocr_recognition/inference.py
@@ -0,0 +1,65 @@
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
+from crnn_ctc_model import ctc_infer
+import numpy as np
+import ctc_reader
+import argparse
+import functools
+import os
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('model_path', str, None, "The model path to be used for inference.")
+add_arg('input_images_dir', str, None, "The directory of images.")
+add_arg('input_images_list', str, None, "The list file of images.")
+add_arg('use_gpu', bool, True, "Whether to use GPU for inference.")
+# yapf: enable
+
+
+def inference(args, infer=ctc_infer, data_reader=ctc_reader):
+    """OCR inference"""
+    num_classes = data_reader.num_classes()
+    data_shape = data_reader.data_shape()
+    # define network
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    sequence = infer(images, num_classes)
+    # data reader
+    infer_reader = data_reader.inference(
+        infer_images_dir=args.input_images_dir,
+        infer_list_file=args.input_images_list)
+    # prepare environment
+    place = fluid.CPUPlace()
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    # load init model
+    model_dir = args.model_path
+    model_file_name = None
+    if not os.path.isdir(args.model_path):
+        model_dir = os.path.dirname(args.model_path)
+        model_file_name = os.path.basename(args.model_path)
+    fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
+    print "Init model from: %s." % args.model_path
+
+    for data in infer_reader():
+        result = exe.run(fluid.default_main_program(),
+                         feed=get_feeder_data(
+                             data, place, need_label=False),
+                         fetch_list=[sequence],
+                         return_numpy=False)
+        print "result: %s" % (np.array(result[0]).flatten(), )
+
+
+def main():
+    args = parser.parse_args()
+    print_arguments(args)
+    inference(args, data_reader=ctc_reader)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/__ocr_recognition/latest_kpis/train_avg_loss_factor.txt b/__ocr_recognition/latest_kpis/train_avg_loss_factor.txt
new file mode 100644
index 00000000..e1a834b0
--- /dev/null
+++ b/__ocr_recognition/latest_kpis/train_avg_loss_factor.txt
@@ -0,0 +1 @@
+[8196.62353515625]
diff --git a/__ocr_recognition/latest_kpis/train_seq_err_factor.txt b/__ocr_recognition/latest_kpis/train_seq_err_factor.txt
new file mode 100644
index 00000000..07787fb2
--- /dev/null
+++ b/__ocr_recognition/latest_kpis/train_seq_err_factor.txt
@@ -0,0 +1 @@
+[828.0]
diff --git a/__ocr_recognition/run.xsh b/__ocr_recognition/run.xsh
new file mode 100755
index 00000000..ebbe41c7
--- /dev/null
+++ b/__ocr_recognition/run.xsh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${ocr_recognition_cudaid:=0} # use the 0-th card by default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.9 python ctc_train.py --use_gpu=True --batch_size=128 --pass_num=1 --iterations=3000
diff --git a/__ocr_recognition/utility.py b/__ocr_recognition/utility.py
new file mode 100644
index 00000000..67a5bfa0
--- /dev/null
+++ b/__ocr_recognition/utility.py
@@ -0,0 +1,90 @@
+"""Contains common utility functions."""
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+from paddle.fluid import core
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
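+
+
+# A note on the layout to_lodtensor builds (assuming int32 label sequences,
+# as produced by the OCR reader): `lod` is Fluid's cumulative-offset list,
+# e.g. sequence lengths [2, 3] yield lod [0, 2, 5] over the flattened data.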
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int32")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def get_feeder_data(data, place, need_label=True):
+    pixel_tensor = core.LoDTensor()
+    pixel_data = None
+    pixel_data = np.concatenate(
+        map(lambda x: x[0][np.newaxis, :], data), axis=0).astype("float32")
+    pixel_tensor.set(pixel_data, place)
+    label_tensor = to_lodtensor(map(lambda x: x[1], data), place)
+    if need_label:
+        return {"pixel": pixel_tensor, "label": label_tensor}
+    else:
+        return {"pixel": pixel_tensor}
diff --git a/resnet30/__init__.py b/__resnet30/__init__.py
similarity index 100%
rename from resnet30/__init__.py
rename to __resnet30/__init__.py
diff --git a/__resnet30/continuous_evaluation.py b/__resnet30/continuous_evaluation.py
new file mode 100644
index 00000000..0ac5f0b4
--- /dev/null
+++ b/__resnet30/continuous_evaluation.py
@@ -0,0 +1,19 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, AccKpi, DurationKpi
+
+train_cost_kpi = CostKpi('train_cost', 0.05, 0, actived=True)
+train_acc_kpi = AccKpi('train_acc', 0.02, 0, actived=True)
+test_acc_kpi = AccKpi('test_acc', 0.05, 0, actived=True)
+train_speed_kpi = AccKpi('train_speed', 0.01, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.02, 0, actived=True)
+
+
+tracking_kpis = [
+    train_cost_kpi,
+    train_acc_kpi,
+    test_acc_kpi,
+    train_speed_kpi,
+    train_duration_kpi,
+]
diff --git a/resnet30/history/train_cost_factor.txt b/__resnet30/history/train_cost_factor.txt
similarity index 79%
rename from resnet30/history/train_cost_factor.txt
rename to __resnet30/history/train_cost_factor.txt
index cf262792..c46c47b0 100644
--- a/resnet30/history/train_cost_factor.txt
+++ b/__resnet30/history/train_cost_factor.txt
@@ -2,4 +2,4 @@
 [[2.744691848754883]]
 [[2.5916006565093994]]
 [[2.459857225418091]]
-[[2.3514037132263184]]
\ No newline at end of file
+[[2.3514037132263184]]
diff --git a/resnet30/history/train_duration_factor.txt b/__resnet30/history/train_duration_factor.txt
similarity index 80%
rename from resnet30/history/train_duration_factor.txt
rename to __resnet30/history/train_duration_factor.txt
index f5c4f294..ebc0b54b 100644
--- a/resnet30/history/train_duration_factor.txt
+++ b/__resnet30/history/train_duration_factor.txt
@@ -2,4 +2,4 @@
 [10.211545944213867]
 [10.223276853561401]
 [10.213245153427124]
-[10.241420984268188]
\ No newline at end of file
+[10.241420984268188]
diff --git a/__resnet30/latest_kpis/test_acc_factor.txt b/__resnet30/latest_kpis/test_acc_factor.txt
new file mode 100644
index 00000000..d7046f5b
--- /dev/null
+++ b/__resnet30/latest_kpis/test_acc_factor.txt
@@ -0,0 +1 @@
+[0.459300000667572]
diff --git a/__resnet30/latest_kpis/train_acc_factor.txt b/__resnet30/latest_kpis/train_acc_factor.txt
new file mode 100644
index 00000000..bab1e137
--- /dev/null
+++ b/__resnet30/latest_kpis/train_acc_factor.txt
@@ -0,0 +1 @@
+[0.56150390625]
diff --git a/__resnet30/latest_kpis/train_cost_factor.txt b/__resnet30/latest_kpis/train_cost_factor.txt
new file mode 100644
index 00000000..c1286410
--- /dev/null
+++ b/__resnet30/latest_kpis/train_cost_factor.txt
@@ -0,0 +1,10 @@
+[[3.121091365814209]]
+[[2.9679136276245117]]
+[[2.664355516433716]]
+[[2.5711519718170166]]
+[[2.484081745147705]]
+[[2.44614839553833]]
+[[2.416034460067749]]
+[[2.4315545558929443]]
+[[2.4579968452453613]]
+[[2.449829578399658]]
diff --git a/__resnet30/latest_kpis/train_duration_factor.txt b/__resnet30/latest_kpis/train_duration_factor.txt
new file mode 100644
index 00000000..6007f902
--- /dev/null
+++ b/__resnet30/latest_kpis/train_duration_factor.txt
@@ -0,0 +1,10 @@
+[22.25151491165161]
+[21.59505009651184]
+[21.59479784965515]
+[21.565481901168823]
+[21.499217987060547]
+[21.321773052215576]
+[21.280965089797974]
+[21.29200315475464]
+[21.28358292579651]
+[21.292808055877686]
diff --git a/__resnet30/latest_kpis/train_speed_factor.txt b/__resnet30/latest_kpis/train_speed_factor.txt
new file mode 100644
index 00000000..6c50ba81
--- /dev/null
+++ b/__resnet30/latest_kpis/train_speed_factor.txt
@@ -0,0 +1 @@
+[55.21354293823242]
diff --git a/resnet30/model.py b/__resnet30/model.py
similarity index 57%
rename from resnet30/model.py
rename to __resnet30/model.py
index 0140ab72..710d70cb 100644
--- a/resnet30/model.py
+++ b/__resnet30/model.py
@@ -11,9 +11,7 @@
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-
-from continuous_evaluation import (train_cost_kpi, train_duration_kpi,
-                                   tracking_kpis)
+from continuous_evaluation import *
 
 logger = logging.getLogger(__name__)
 
@@ -85,86 +83,123 @@ def train(batch_size, device, pass_num, iterations):
     input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
+    # Train program
     predict = resnet_cifar10(input, class_dim)
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(x=cost)
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-    opts = optimizer.minimize(avg_cost)
-    # accuracy = fluid.evaluator.Evaluator(input=predict, label=label)
+
+    # Evaluator
+    # accuracy = fluid.evaluator.Evaluator(input=predict, label=label)
+
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+    accuracy = fluid.average.WeightedAverage()
 
     # inference program
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
         # test_target = accuracy.metrics + accuracy.states
-        test_target = [predict, avg_cost]
-        inference_program = fluid.io.get_inference_program(test_target)
+        target_vars = [batch_acc, batch_size_tensor]
+        inference_program = fluid.io.get_inference_program(target_vars)
 
+    # Optimization
+    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+    opts = optimizer.minimize(avg_cost)
     fluid.memory_optimize(fluid.default_main_program())
 
     train_reader = paddle.batch(
-        paddle.dataset.cifar.train10(),
-        batch_size=batch_size)
+        paddle.dataset.cifar.train10(), batch_size=batch_size)
     test_reader = paddle.batch(
         paddle.dataset.cifar.test10(), batch_size=batch_size)
 
+    # Initialize executor
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
     def test(exe):
-        # accuracy.reset(exe)
+        test_accuracy = fluid.average.WeightedAverage()
         for batch_id, data in enumerate(test_reader()):
             img_data = np.array(map(lambda x: x[0].reshape(dshape),
                                     data)).astype("float32")
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])
-            # print('image_data', img_data)
-            # print('y_data', y_data)
-
-            predict_, avg_cost_ = exe.run(
-                inference_program,
-                feed={
-                    "data": img_data,
-                    "label": y_data
-                },
-                fetch_list=[predict, avg_cost])
-        return avg_cost
+            acc, weight = exe.run(inference_program,
+                                  feed={"data": img_data,
+                                        "label": y_data},
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
 
-        # return accuracy.eval(exe)
-
-    place = core.CPUPlace() if device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
+        return test_accuracy.eval()
 
-    for pass_id in range(1):
-        logger.warning('Pass {}'.format(pass_id))
-        # accuracy.reset(exe)
+    im_num = 0
+    total_train_time = 0.0
+    for pass_id in range(args.pass_num):
         iter = 0
+        every_pass_loss = []
+        accuracy.reset()
+        pass_duration = 0.0
         for batch_id, data in enumerate(train_reader()):
             logger.warning('Batch {}'.format(batch_id))
             batch_start = time.time()
             if iter == iterations:
                 break
-            image = np.array(map(lambda x: x[0].reshape(dshape),
-                             data)).astype('float32')
+            image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
+                'float32')
             label = np.array(map(lambda x: x[1], data)).astype('int64')
             label = label.reshape([-1, 1])
-            avg_cost_ = exe.run(
+
+            loss, acc, weight = exe.run(
                 fluid.default_main_program(),
-                feed={
-                    'data': image,
-                    'label': label
-                },
-                fetch_list=[avg_cost])
+                feed={'data': image,
+                      'label': label},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+
             batch_end = time.time()
-            print('avg_cost', np.array(avg_cost_, dtype='float32'))
-            train_cost_kpi.add_record(np.array(avg_cost_, dtype='float32'))
-            train_duration_kpi.add_record(batch_end - batch_start)
+            every_pass_loss.append(loss)
+            accuracy.add(value=acc, weight=weight)
+
+            if iter >= args.skip_batch_num or pass_id != 0:
+                batch_duration = time.time() - batch_start
+                pass_duration += batch_duration
+                im_num += label.shape[0]
             iter += 1
-            # test_start = time.time()
-            # test(exe)
-            # test_end = time.time()
-            # valid_tracker.add(test_end - test_start, pass_test_acc)
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iter, loss, acc))
+        pass_train_acc = accuracy.eval()
+        pass_test_acc = test(exe)
+
+        total_train_time += pass_duration
+        pass_train_loss = np.mean(every_pass_loss)
+        print(
+            "Pass:%d, Loss:%f, Train Accuracy:%f, Test Accuracy:%f, Handle Images Duration: %f\n"
+            % (pass_id, pass_train_loss, pass_train_acc,
+               pass_test_acc, pass_duration))
+        if pass_id == args.pass_num - 1:
+            train_cost_kpi.add_record(np.array(pass_train_loss, dtype='float32'))
+            train_cost_kpi.persist()
+            train_acc_kpi.add_record(np.array(pass_train_acc, dtype='float32'))
+            train_acc_kpi.persist()
+            test_acc_kpi.add_record(np.array(pass_test_acc, dtype='float32'))
+            test_acc_kpi.persist()
+            train_duration_kpi.add_record(batch_end - batch_start)
+            train_duration_kpi.persist()
+
+    if total_train_time > 0.0:
+        examples_per_sec = im_num / total_train_time
+        sec_per_batch = total_train_time / \
+            (iter * args.pass_num - args.skip_batch_num)
+        train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32'))
+        train_speed_kpi.persist()
 
 
 def parse_args():
@@ -172,6 +207,14 @@
     parser.add_argument('--batch_size', type=int)
     parser.add_argument('--device', type=str, choices=('CPU', 'GPU'))
     parser.add_argument('--iters', type=int)
+    parser.add_argument(
+        '--pass_num', type=int, default=3, help='The number of passes.')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial minibatches to skip, for a more accurate performance test'
+    )
     args = parser.parse_args()
     return args
 
diff --git a/resnet30/run.xsh b/__resnet30/run.xsh
similarity index 81%
rename from resnet30/run.xsh
rename to __resnet30/run.xsh
index 11393550..33644395 100755
--- a/resnet30/run.xsh
+++ b/__resnet30/run.xsh
@@ -9,4 +9,4 @@
 import sys
 
 model_file = 'model.py'
-python @(model_file) --batch_size 1000 --iters 10 --device CPU
+python @(model_file) --batch_size 128 --pass_num 5 --iters 80 --device CPU
diff --git a/image_classification/continuous_evaluation.py b/image_classification/continuous_evaluation.py
new file mode 100644
index 00000000..21f3ea06
--- /dev/null
+++ b/image_classification/continuous_evaluation.py
@@ -0,0 +1,27 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+train_acc_top1_kpi = AccKpi('train_acc_top1_kpi', 0.05, 0,
+                            desc='TOP1 ACC')
+train_acc_top5_kpi = AccKpi('train_acc_top5_kpi', 0.05, 0,
+                            actived=True,
+                            desc='TOP5 ACC')
+train_cost_kpi = CostKpi('train_cost_kpi', 0.05, 0,
+                         actived=True,
+                         desc='train cost')
+train_speed_kpi = AccKpi('train_speed_kpi', 0.05, 0,
+                         actived=True,
+                         unit_repr='images/s',
+                         desc='train speed on one GPU card')
+four_card_train_speed_kpi = AccKpi('four_card_train_speed_kpi', 0.05, 0,
+                                   actived=True,
+                                   unit_repr='images/s',
+                                   desc='train speed on four GPU cards')
+
+tracking_kpis = [train_acc_top1_kpi,
+                 train_acc_top5_kpi,
+                 train_cost_kpi,
+                 train_speed_kpi,
+                 four_card_train_speed_kpi]
diff --git a/image_classification/latest_kpis/four_card_train_speed_kpi_factor.txt b/image_classification/latest_kpis/four_card_train_speed_kpi_factor.txt
new file mode 100644
index 00000000..f2ce4eee
--- /dev/null
+++ b/image_classification/latest_kpis/four_card_train_speed_kpi_factor.txt
@@ -0,0 +1 @@
+[174.80782203734947]
\ No newline at end of file
diff --git a/image_classification/latest_kpis/train_acc_top1_kpi_factor.txt b/image_classification/latest_kpis/train_acc_top1_kpi_factor.txt
new file mode 100644
index 00000000..351fea16
--- /dev/null
+++ b/image_classification/latest_kpis/train_acc_top1_kpi_factor.txt
@@ -0,0 +1 @@
+[0.3767074942588806]
\ No newline at end of file
diff --git a/image_classification/latest_kpis/train_acc_top5_kpi_factor.txt b/image_classification/latest_kpis/train_acc_top5_kpi_factor.txt
new file mode 100644
index 00000000..9e1b075c
--- /dev/null
+++ b/image_classification/latest_kpis/train_acc_top5_kpi_factor.txt
@@ -0,0 +1 @@
+[0.5719688820838928]
diff --git a/image_classification/latest_kpis/train_cost_kpi_factor.txt b/image_classification/latest_kpis/train_cost_kpi_factor.txt
new file mode 100644
index 00000000..c896d6ff
--- /dev/null
+++ b/image_classification/latest_kpis/train_cost_kpi_factor.txt
@@ -0,0 +1 @@
+[2.875904941558838]
diff --git a/image_classification/latest_kpis/train_speed_kpi_factor.txt b/image_classification/latest_kpis/train_speed_kpi_factor.txt
new file mode 100644
index 00000000..6a664608
--- /dev/null
+++ b/image_classification/latest_kpis/train_speed_kpi_factor.txt
@@ -0,0 +1 @@
+[101.29667191639184]
\ No newline at end of file
diff --git a/image_classification/mobilenet.py b/image_classification/mobilenet.py
new file mode 100644
index 00000000..0a8197f1
--- /dev/null +++ b/image_classification/mobilenet.py @@ -0,0 +1,153 @@ +import os + +import paddle +import paddle.fluid as fluid +from paddle.fluid.initializer import MSRA +from paddle.fluid.param_attr import ParamAttr + +parameter_attr = ParamAttr(initializer=MSRA()) + + +def conv_bn_layer(input, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='relu', + use_cudnn=True): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + act=None, + use_cudnn=use_cudnn, + param_attr=parameter_attr, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + +def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride, + scale): + depthwise_conv = conv_bn_layer( + input=input, + filter_size=3, + num_filters=int(num_filters1 * scale), + stride=stride, + padding=1, + num_groups=int(num_groups * scale), + use_cudnn=False) + + pointwise_conv = conv_bn_layer( + input=depthwise_conv, + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + return pointwise_conv + + +def mobile_net(img, class_dim, scale=1.0): + + # conv1: 112x112 + tmp = conv_bn_layer( + img, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + # 56x56 + tmp = depthwise_separable( + tmp, + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + + tmp = depthwise_separable( + tmp, + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=2, + scale=scale) + + # 28x28 + tmp = depthwise_separable( + tmp, + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + + tmp = depthwise_separable( + tmp, + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=2, + scale=scale) + + # 14x14 + tmp = depthwise_separable( + tmp, + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + + tmp = depthwise_separable( + tmp, + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=2, + scale=scale) + + # 14x14 + for i in range(5): + tmp = depthwise_separable( + tmp, + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + scale=scale) + # 7x7 + tmp = depthwise_separable( + tmp, + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=2, + scale=scale) + + tmp = depthwise_separable( + tmp, + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=1, + scale=scale) + + tmp = fluid.layers.pool2d( + input=tmp, + pool_size=0, + pool_stride=1, + pool_type='avg', + global_pooling=True) + + tmp = fluid.layers.fc(input=tmp, + size=class_dim, + act='softmax', + param_attr=parameter_attr) + return tmp diff --git a/image_classification/reader.py b/image_classification/reader.py new file mode 100644 index 00000000..4061d1d3 --- /dev/null +++ b/image_classification/reader.py @@ -0,0 +1,164 @@ +import os +import math +import random +import functools +import numpy as np +import paddle +from PIL import Image, ImageEnhance + +random.seed(0) + +DATA_DIM = 224 + +THREAD = 8 +BUF_SIZE = 1024 + +DATA_DIR = 'ILSVRC2012' +TRAIN_LIST = 'ILSVRC2012/train_list.txt' +TEST_LIST = 'ILSVRC2012/test_list.txt' + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = 
int(round(img.size[1] * percent))
+    img = img.resize((resized_width, resized_height), Image.LANCZOS)
+    return img
+
+
+def crop_image(img, target_size, center):
+    width, height = img.size
+    size = target_size
+    if center:
+        w_start = (width - size) / 2
+        h_start = (height - size) / 2
+    else:
+        w_start = random.randint(0, width - size)
+        h_start = random.randint(0, height - size)
+    w_end = w_start + size
+    h_end = h_start + size
+    img = img.crop((w_start, h_start, w_end, h_end))
+    return img
+
+
+def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
+    aspect_ratio = math.sqrt(random.uniform(*ratio))
+    w = 1. * aspect_ratio
+    h = 1. / aspect_ratio
+
+    bound = min((float(img.size[0]) / img.size[1]) / (w**2),
+                (float(img.size[1]) / img.size[0]) / (h**2))
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+
+    target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
+                                                             scale_max)
+    target_size = math.sqrt(target_area)
+    w = int(target_size * w)
+    h = int(target_size * h)
+
+    i = random.randint(0, img.size[0] - w)
+    j = random.randint(0, img.size[1] - h)
+
+    img = img.crop((i, j, i + w, j + h))
+    img = img.resize((size, size), Image.LANCZOS)
+    return img
+
+
+def rotate_image(img):
+    angle = random.randint(-10, 10)
+    img = img.rotate(angle)
+    return img
+
+
+def distort_color(img):
+    def random_brightness(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Brightness(img).enhance(e)
+
+    def random_contrast(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Contrast(img).enhance(e)
+
+    def random_color(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Color(img).enhance(e)
+
+    ops = [random_brightness, random_contrast, random_color]
+    random.shuffle(ops)
+
+    img = ops[0](img)
+    img = ops[1](img)
+    img = ops[2](img)
+
+    return img
+
+
+def process_image(sample, mode, color_jitter, rotate):
+    img_path = sample[0]
+
+    img = Image.open(img_path)
+    if mode == 'train':
+        if rotate: img = rotate_image(img)
+        img = random_crop(img, DATA_DIM)
+    else:
+        img = resize_short(img, DATA_DIM)
+        img = crop_image(img, target_size=DATA_DIM, center=True)
+    if mode == 'train':
+        if color_jitter:
+            img = distort_color(img)
+        if random.randint(0, 1) == 1:
+            img = img.transpose(Image.FLIP_LEFT_RIGHT)
+
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
+
+    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
+    img -= img_mean
+    img /= img_std
+
+    if mode == 'train' or mode == 'test':
+        return img, sample[1]
+    elif mode == 'infer':
+        return [img]
+
+
+def _reader_creator(file_list,
+                    mode,
+                    shuffle=False,
+                    color_jitter=False,
+                    rotate=False):
+    def reader():
+        with open(file_list) as flist:
+            lines = [line.strip() for line in flist]
+            if shuffle:
+                random.shuffle(lines)
+            for line in lines:
+                if mode == 'train' or mode == 'test':
+                    img_path, label = line.split()
+                    img_path = os.path.join(DATA_DIR, img_path)
+                    yield img_path, int(label)
+                elif mode == 'infer':
+                    img_path = os.path.join(DATA_DIR, line)
+                    yield [img_path]
+
+    mapper = functools.partial(
+        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
+
+    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
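+
+
+# The wrappers below are thin shims over _reader_creator; process_image then
+# runs on THREAD worker threads via paddle.reader.xmap_readers, buffered up
+# to BUF_SIZE samples.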
+def train(file_list=TRAIN_LIST):
+    return _reader_creator(
+        file_list, 'train', shuffle=True, color_jitter=False, rotate=False)
+
+
+def test(file_list=TEST_LIST):
+    return _reader_creator(file_list, 'test', shuffle=False)
+
+
+def infer(file_list):
+    return _reader_creator(file_list, 'infer', shuffle=False)
diff --git a/image_classification/run.xsh b/image_classification/run.xsh
new file mode 100755
index 00000000..f003feb2
--- /dev/null
+++ b/image_classification/run.xsh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${multi_se_resnext_cudaid:=0,1,2,3} # use cards 0,1,2,3 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py --batch_size=64
+mv train_speed_kpi_factor.txt four_card_train_speed_kpi_factor.txt
+
+cudaid=${se_resnext_cudaid:=0} # use the 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py --batch_size=64
diff --git a/image_classification/se_resnext.py b/image_classification/se_resnext.py
new file mode 100644
index 00000000..ad533c75
--- /dev/null
+++ b/image_classification/se_resnext.py
@@ -0,0 +1,138 @@
+import os
+import numpy as np
+import time
+import sys
+import paddle
+import paddle.fluid as fluid
+import reader
+import paddle.fluid.layers.control_flow as control_flow
+import paddle.fluid.layers.nn as nn
+import paddle.fluid.layers.tensor as tensor
+import math
+
+
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) / 2,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv, act=act)
+
+
+def squeeze_excitation(input, num_channels, reduction_ratio):
+    pool = fluid.layers.pool2d(
+        input=input, pool_size=0, pool_type='avg', global_pooling=True)
+    stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+    squeeze = fluid.layers.fc(input=pool,
+                              size=num_channels / reduction_ratio,
+                              act='relu',
+                              param_attr=fluid.param_attr.ParamAttr(
+                                  initializer=fluid.initializer.Uniform(-stdv,
+                                                                        stdv)))
+    stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
+    excitation = fluid.layers.fc(input=squeeze,
+                                 size=num_channels,
+                                 act='sigmoid',
+                                 param_attr=fluid.param_attr.ParamAttr(
+                                     initializer=fluid.initializer.Uniform(
+                                         -stdv, stdv)))
+    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+    return scale
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]
+    if ch_in != ch_out or stride != 1:
+        filter_size = 1
+        return conv_bn_layer(input, ch_out, filter_size, stride)
+    else:
+        return input
+
+
+def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+    scale = squeeze_excitation(
+        input=conv2,
+        num_channels=num_filters * 2,
+        reduction_ratio=reduction_ratio)
+
+    short = shortcut(input, num_filters * 2, stride)
+
+    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+
+def SE_ResNeXt(input, class_dim, infer=False, layers=50):
+    supported_layers = [50, 152]
+    if layers not in supported_layers:
+        print("supported layers are", supported_layers, \
+              "but input layer is ", layers)
+        exit()
+    if layers == 50:
+        cardinality = 32
+        reduction_ratio = 16
+        depth = [3, 4, 6, 3]
+        num_filters = [128, 256, 512, 1024]
+
+        conv = conv_bn_layer(
+            input=input, num_filters=64, filter_size=7, stride=2, act='relu')
+        conv = fluid.layers.pool2d(
+            input=conv,
pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=3, stride=2, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=128, filter_size=3, stride=1, act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + if not infer: + drop = fluid.layers.dropout(x=pool, dropout_prob=0.5) + else: + drop = pool + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc(input=drop, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out diff --git a/image_classification/train.py b/image_classification/train.py new file mode 100644 index 00000000..c7a4fb43 --- /dev/null +++ b/image_classification/train.py @@ -0,0 +1,407 @@ +import os +import numpy as np +import time +import sys +import paddle +import paddle.fluid as fluid +from se_resnext import SE_ResNeXt +from mobilenet import mobile_net +import paddle.dataset.flowers as flowers +import reader + +import argparse +import functools +import paddle.fluid.layers.ops as ops +from utility import add_arguments, print_arguments +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter +import math + +from continuous_evaluation import (train_acc_top1_kpi, train_acc_top5_kpi, + train_cost_kpi, train_speed_kpi) +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.") +add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.") +add_arg('parallel_exe', bool, True, "Whether to use ParallelExecutor to train or not.") +add_arg('init_model', str, None, "Whether to use initialized model.") +add_arg('pretrained_model', str, None, "Whether to use pretrained model.") +add_arg('lr_strategy', str, "cosine_decay", + "Set the learning rate decay strategy.") +add_arg('model', str, "se_resnext", "Set the network to use.") + + +def cosine_decay(learning_rate, step_each_epoch, epochs=120): + """Applies cosine decay to the learning rate. 
+    decayed_lr = learning_rate * (math.cos(epoch * (math.pi / epochs)) + 1) / 2
+    """
+    global_step = _decay_step_counter()
+
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * \
+            (ops.cos(epoch * (math.pi / epochs)) + 1) / 2
+    return decayed_lr
+
+
+def train_parallel_do(args,
+                      learning_rate,
+                      batch_size,
+                      num_passes,
+                      init_model=None,
+                      pretrained_model=None,
+                      model_save_dir='model',
+                      parallel=True,
+                      use_nccl=True,
+                      lr_strategy=None,
+                      layers=50):
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
+
+        with pd.do():
+            image_ = pd.read_input(image)
+            label_ = pd.read_input(label)
+            if args.model == 'se_resnext':
+                out = SE_ResNeXt(
+                    input=image_, class_dim=class_dim, layers=layers)
+            else:
+                out = mobile_net(img=image_, class_dim=class_dim)
+
+            cost = fluid.layers.cross_entropy(input=out, label=label_)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
+            pd.write_output(avg_cost)
+            pd.write_output(acc_top1)
+            pd.write_output(acc_top5)
+
+        avg_cost, acc_top1, acc_top5 = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        acc_top1 = fluid.layers.mean(x=acc_top1)
+        acc_top5 = fluid.layers.mean(x=acc_top5)
+    else:
+        if args.model == 'se_resnext':
+            out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+        else:
+            out = mobile_net(img=image, class_dim=class_dim)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    inference_program = fluid.default_main_program().clone(for_test=True)
+
+    if "piecewise_decay" in lr_strategy:
+        bd = lr_strategy["piecewise_decay"]["bd"]
+        lr = lr_strategy["piecewise_decay"]["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    elif "cosine_decay" in lr_strategy:
+        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
+        epochs = lr_strategy["cosine_decay"]["epochs"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=cosine_decay(
+                learning_rate=learning_rate,
+                step_each_epoch=step_each_epoch,
+                epochs=epochs),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    opts = optimizer.minimize(avg_cost)
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    for pass_id in range(num_passes):
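+        # train_info/test_info gather [loss, top1 acc, top5 acc] per batch;
+        # their per-pass means are reported in the "End pass" line below.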
+
+
+def train_parallel_do(args,
+                      learning_rate,
+                      batch_size,
+                      num_passes,
+                      init_model=None,
+                      pretrained_model=None,
+                      model_save_dir='model',
+                      parallel=True,
+                      use_nccl=True,
+                      lr_strategy=None,
+                      layers=50):
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
+
+        with pd.do():
+            image_ = pd.read_input(image)
+            label_ = pd.read_input(label)
+            # string options must be compared with `==`; `is` checks identity
+            if args.model == 'se_resnext':
+                out = SE_ResNeXt(
+                    input=image_, class_dim=class_dim, layers=layers)
+            else:
+                out = mobile_net(img=image_, class_dim=class_dim)
+
+            cost = fluid.layers.cross_entropy(input=out, label=label_)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
+            pd.write_output(avg_cost)
+            pd.write_output(acc_top1)
+            pd.write_output(acc_top5)
+
+        avg_cost, acc_top1, acc_top5 = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        acc_top1 = fluid.layers.mean(x=acc_top1)
+        acc_top5 = fluid.layers.mean(x=acc_top5)
+    else:
+        if args.model == 'se_resnext':
+            out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+        else:
+            out = mobile_net(img=image, class_dim=class_dim)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    inference_program = fluid.default_main_program().clone(for_test=True)
+
+    if "piecewise_decay" in lr_strategy:
+        bd = lr_strategy["piecewise_decay"]["bd"]
+        lr = lr_strategy["piecewise_decay"]["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    elif "cosine_decay" in lr_strategy:
+        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
+        epochs = lr_strategy["cosine_decay"]["epochs"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=cosine_decay(
+                learning_rate=learning_rate,
+                step_each_epoch=step_each_epoch,
+                epochs=epochs),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    opts = optimizer.minimize(avg_cost)
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
+        for batch_id, data in enumerate(train_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            train_info[0].append(loss[0])
+            train_info[1].append(acc1[0])
+            train_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, \
+                    acc1 {3}, acc5 {4} time {5}"
+                      .format(pass_id, \
+                              batch_id, loss[0], acc1[0], acc5[0], \
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
+        # enumerate so batch_id refers to the current test batch rather than
+        # the stale value left over from the training loop
+        for batch_id, data in enumerate(test_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                inference_program,
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            test_info[0].append(loss[0])
+            test_info[1].append(acc1[0])
+            test_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0},testbatch {1},loss {2}, \
+                    acc1 {3},acc5 {4},time {5}"
+                      .format(pass_id, \
+                              batch_id, loss[0], acc1[0], acc5[0], \
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
+            test_loss {4}, test_acc1 {5}, test_acc5 {6}"
+              .format(pass_id, \
+                      train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
+                      test_acc5))
+        sys.stdout.flush()
+
+        model_path = os.path.join(model_save_dir + '/' + args.model,
+                                  str(pass_id))
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)
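+
+
+# Note: train_parallel_do (above) replicates the network per device through
+# fluid.layers.ParallelDo, while train_parallel_exe (below) builds a single
+# program and lets fluid.ParallelExecutor schedule it across GPUs; __main__
+# selects between them via the --parallel_exe flag.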
+
+
+def train_parallel_exe(args,
+                       learning_rate,
+                       batch_size,
+                       num_passes,
+                       init_model=None,
+                       pretrained_model=None,
+                       model_save_dir='model',
+                       parallel=True,
+                       use_nccl=True,
+                       lr_strategy=None,
+                       layers=50):
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    # string options must be compared with `==`; `is` checks identity
+    if args.model == 'se_resnext':
+        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+    else:
+        out = mobile_net(img=image, class_dim=class_dim)
+
+    cost = fluid.layers.cross_entropy(input=out, label=label)
+    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    test_program = fluid.default_main_program().clone(for_test=True)
+
+    if "piecewise_decay" in lr_strategy:
+        bd = lr_strategy["piecewise_decay"]["bd"]
+        lr = lr_strategy["piecewise_decay"]["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    elif "cosine_decay" in lr_strategy:
+        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
+        epochs = lr_strategy["cosine_decay"]["epochs"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=cosine_decay(
+                learning_rate=learning_rate,
+                step_each_epoch=step_each_epoch,
+                epochs=epochs),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    opts = optimizer.minimize(avg_cost)
+
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    # random_seed is a program attribute, so default_startup_program() must be
+    # called; assigning to the function object itself has no effect
+    fluid.default_startup_program().random_seed = 1000
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    train_reader = paddle.batch(flowers.train(), batch_size=batch_size)
+    test_reader = paddle.batch(flowers.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
+    test_exe = fluid.ParallelExecutor(
+        use_cuda=True, main_program=test_program, share_vars_from=train_exe)
+
+    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
+    train_speed = []
+    for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
+        pass_time = 0
+        pass_num = 0
+        pass_speed = 0.0
+        for batch_id, data in enumerate(train_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = train_exe.run(fetch_list,
+                                             feed=feeder.feed(data))
+            t2 = time.time()
+            period = t2 - t1
+            pass_time += period
+            pass_num += len(data)
+            loss = np.mean(np.array(loss))
+            acc1 = np.mean(np.array(acc1))
+            acc5 = np.mean(np.array(acc5))
+            train_info[0].append(loss)
+            train_info[1].append(acc1)
+            train_info[2].append(acc5)
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, \
+                    acc1 {3}, acc5 {4} time {5}"
+                      .format(pass_id, \
+                              batch_id, loss, acc1, acc5, \
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
+        pass_speed = pass_num / pass_time
+        train_speed.append(pass_speed)
+        if pass_id == num_passes - 1:
+            train_acc_top1_kpi.add_record(train_acc1)
+            train_acc_top5_kpi.add_record(train_acc5)
+            train_cost_kpi.add_record(train_loss)
+            # average the per-pass speeds collected in train_speed; averaging
+            # the scalar pass_speed would just record the last pass
+            mean_pass_speed = np.array(train_speed).mean()
+            train_speed_kpi.add_record(mean_pass_speed)
+        # enumerate so batch_id refers to the current test batch rather than
+        # the stale value left over from the training loop
+        for batch_id, data in enumerate(test_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = test_exe.run(fetch_list, feed=feeder.feed(data))
+            t2 = time.time()
+            period = t2 - t1
+            loss = np.mean(np.array(loss))
+            acc1 = np.mean(np.array(acc1))
+            acc5 = np.mean(np.array(acc5))
+            test_info[0].append(loss)
+            test_info[1].append(acc1)
+            test_info[2].append(acc5)
+            if batch_id % 10 == 0:
+                print("Pass {0},testbatch {1},loss {2}, \
+                    acc1 {3},acc5 {4},time {5}"
+                      .format(pass_id, \
+                              batch_id, loss, acc1, acc5, \
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
+            test_loss {4}, test_acc1 {5}, test_acc5 {6}, pass_time {7}, train_speed {8}"
+              .format(pass_id, \
+                      train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
+                      test_acc5, pass_time, pass_num / pass_time))
+        sys.stdout.flush()
+    train_acc_top1_kpi.persist()
+    train_acc_top5_kpi.persist()
+    train_cost_kpi.persist()
+    train_speed_kpi.persist()
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+
+    total_images = 1281167
+    batch_size = args.batch_size
+    step = int(total_images / batch_size + 1)
+    num_epochs = 5
+
+    learning_rate_mode = args.lr_strategy
+    lr_strategy = {}
+    if learning_rate_mode == "piecewise_decay":
+        epoch_points = [30, 60, 90]
+        bd = [e * step for e in epoch_points]
+        lr = [0.1, 0.01, 0.001, 0.0001]
+        lr_strategy[learning_rate_mode] = {"bd": bd, "lr": lr}
+    elif learning_rate_mode == "cosine_decay":
+        lr_strategy[learning_rate_mode] = {
+            "step_each_epoch": step,
+            "epochs": num_epochs
+        }
+    else:
+        lr_strategy = None
+
+    use_nccl = True
+    # layers: 50, 152
+    layers = args.num_layers
+    method = train_parallel_exe if args.parallel_exe else train_parallel_do
+    init_model = args.init_model if args.init_model else None
+    pretrained_model = args.pretrained_model if args.pretrained_model else None
+    method(
+        args,
+        learning_rate=0.1,
+        batch_size=batch_size,
+        num_passes=num_epochs,
+        init_model=init_model,
+        pretrained_model=pretrained_model,
+        parallel=True,
+        use_nccl=True,
+        lr_strategy=lr_strategy,
+        layers=layers)
diff --git a/image_classification/utility.py b/image_classification/utility.py
new file mode 100644
index 00000000..506e6007
--- /dev/null
+++ b/image_classification/utility.py
@@ -0,0 +1,62 @@
+"""Contains common utility functions."""
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+from paddle.fluid import core
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
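+
+# A minimal usage sketch of the two helpers above, mirroring how
+# image_classification/train.py wires them together (illustrative only):
+#
+#     import argparse
+#     import functools
+#     from utility import add_arguments, print_arguments
+#
+#     parser = argparse.ArgumentParser(description=__doc__)
+#     add_arg = functools.partial(add_arguments, argparser=parser)
+#     add_arg('batch_size', int, 256, "Minibatch size.")
+#     args = parser.parse_args()
+#     print_arguments(args)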
diff --git a/language_model/README.md b/language_model/README.md
new file mode 100644
index 00000000..91ce2d7f
--- /dev/null
+++ b/language_model/README.md
@@ -0,0 +1,148 @@
+# Language Model
+
+A brief directory structure and description for this example:
+
+```text
+.
+├── README.md            # documentation
+├── train.py             # training script
+├── infer.py             # inference script
+└── utils.py             # common utilities
+```
+
+
+## Introduction
+
+For an introduction to recurrent neural network language models, see the paper [Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329). In this example we implement a GRU-RNN language model.
+
+## Training
+
+Run the command `python train.py` to start training the model.
+```python
+python train.py
+```
+
+The currently supported parameters can be found in the `train_net` function of [train.py](./train.py):
+```python
+vocab, train_reader, test_reader = utils.prepare_data(
+    batch_size=20,  # batch size
+    buffer_size=1000,  # buffer size, default value is OK
+    word_freq_threshold=0)  # vocabulary related parameter, and words with frequency below this value will be filtered
+
+train(train_reader=train_reader,
+      vocab=vocab,
+      network=network,
+      hid_size=200,  # embedding and hidden size
+      base_lr=1.0,  # base learning rate
+      batch_size=20,  # batch size, the same as that in prepare_data
+      pass_num=12,  # the number of passes for training
+      use_cuda=True,  # whether to use GPU card
+      parallel=False,  # whether to be parallel
+      model_dir="model",  # directory to save model
+      init_low_bound=-0.1,  # uniform parameter initialization lower bound
+      init_high_bound=0.1)  # uniform parameter initialization upper bound
+```
+
+## Customizing the Network Structure
+
+The network structure can be adjusted in the `network` function of [train.py](./train.py); the current structure is shown below. Note that `fc0` projects to `hid_size * 3` because `dynamic_gru` expects the update-gate, reset-gate, and candidate projections concatenated:
+```python
+emb = fluid.layers.embedding(input=src, size=[vocab_size, hid_size],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+            learning_rate=emb_lr_x),
+        is_sparse=True)
+
+fc0 = fluid.layers.fc(input=emb, size=hid_size * 3,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+            learning_rate=gru_lr_x))
+gru_h0 = fluid.layers.dynamic_gru(input=fc0, size=hid_size,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+            learning_rate=gru_lr_x))
+
+fc = fluid.layers.fc(input=gru_h0, size=vocab_size, act='softmax',
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+            learning_rate=fc_lr_x))
+
+cost = fluid.layers.cross_entropy(input=fc, label=dst)
+```
+
+## Sample Training Results
+
+The log of training on a single Tesla K40m GPU card is shown below:
+```text
+epoch_1 start
+step:100 ppl:771.053
+step:200 ppl:449.597
+step:300 ppl:642.654
+step:400 ppl:458.128
+step:500 ppl:510.912
+step:600 ppl:451.545
+step:700 ppl:364.404
+step:800 ppl:324.272
+step:900 ppl:360.797
+step:1000 ppl:275.761
+step:1100 ppl:294.599
+step:1200 ppl:335.877
+step:1300 ppl:185.262
+step:1400 ppl:241.744
+step:1500 ppl:211.507
+step:1600 ppl:233.431
+step:1700 ppl:298.767
+step:1800 ppl:203.403
+step:1900 ppl:158.828
+step:2000 ppl:171.148
+step:2100 ppl:280.884
+epoch:1 num_steps:2104 time_cost(s):47.478780
+model saved in model/epoch_1
+epoch_2 start
+step:100 ppl:238.099
+step:200 ppl:136.527
+step:300 ppl:204.184
+step:400 ppl:252.886
+step:500 ppl:177.377
+step:600 ppl:197.688
+step:700 ppl:131.650
+step:800 ppl:223.906
+step:900 ppl:144.785
+step:1000 ppl:176.286
+step:1100 ppl:148.158
+step:1200 ppl:203.581
+step:1300 ppl:168.208
+step:1400 ppl:159.412
+step:1500 ppl:114.032
+step:1600 ppl:157.985
+step:1700 ppl:147.743
+step:1800 ppl:88.676
+step:1900 ppl:141.962
+step:2000 ppl:106.087
+step:2100 ppl:122.709
+epoch:2 num_steps:2104 time_cost(s):47.583789
+model saved in model/epoch_2
+...
+```
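+
+The `ppl` column above is perplexity, the exponential of the average per-word cross-entropy cost. A minimal sketch of the computation, mirroring how [infer.py](./infer.py) accumulates it over a dataset:
+
+```python
+import math
+
+def perplexity(accum_cost, accum_words):
+    """ppl = exp(total cross-entropy cost / number of predicted words)"""
+    return math.exp(accum_cost / accum_words)
+
+print("%.3f" % perplexity(1070.0, 200))  # ~210.6
+```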
+
+## Inference
+Run `python infer.py model_dir start_epoch last_epoch(inclusive)` to run inference, where `start_epoch` is the first epoch to evaluate and `last_epoch` the last one (inclusive), e.g.
+```python
+python infer.py model 1 12     # prediction from epoch 1 to epoch 12
+```
+
+## Sample Inference Results
+```text
+model:model/epoch_1 ppl:254.540 time_cost(s):3.29
+model:model/epoch_2 ppl:177.671 time_cost(s):3.27
+model:model/epoch_3 ppl:156.251 time_cost(s):3.27
+model:model/epoch_4 ppl:139.036 time_cost(s):3.27
+model:model/epoch_5 ppl:132.661 time_cost(s):3.27
+model:model/epoch_6 ppl:130.092 time_cost(s):3.28
+model:model/epoch_7 ppl:128.751 time_cost(s):3.27
+model:model/epoch_8 ppl:125.411 time_cost(s):3.27
+model:model/epoch_9 ppl:124.604 time_cost(s):3.28
+model:model/epoch_10 ppl:124.754 time_cost(s):3.29
+model:model/epoch_11 ppl:125.421 time_cost(s):3.27
+model:model/epoch_12 ppl:125.676 time_cost(s):3.27
+```
diff --git a/language_model/continuous_evaluation.py b/language_model/continuous_evaluation.py
new file mode 100644
index 00000000..a7bc53bd
--- /dev/null
+++ b/language_model/continuous_evaluation.py
@@ -0,0 +1,17 @@
+"""
+continuous_evaluation.py
+"""
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0)
+imikolov_20_pass_duration_kpi = DurationKpi('imikolov_20_pass_duration', 0.02,
+                                            0, actived=True)
+
+tracking_kpis = [
+    imikolov_20_avg_ppl_kpi,
+    imikolov_20_pass_duration_kpi,
+]
diff --git a/language_model/infer.py b/language_model/infer.py
new file mode 100644
index 00000000..a183d548
--- /dev/null
+++ b/language_model/infer.py
@@ -0,0 +1,65 @@
+import sys
+import time
+import math
+import unittest
+import contextlib
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+import utils
+
+
+def infer(test_reader, use_cuda, model_path):
+    """ inference function """
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    with fluid.scope_guard(fluid.core.Scope()):
+        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(
+            model_path, exe)
+
+        accum_cost = 0.0
+        accum_words = 0
+        t0 = time.time()
+        for data in test_reader():
+            src_wordseq = utils.to_lodtensor(map(lambda x: x[0], data), place)
+            dst_wordseq = utils.to_lodtensor(map(lambda x: x[1], data), place)
+            avg_cost = exe.run(
+                infer_program,
+                feed={"src_wordseq": src_wordseq,
+                      "dst_wordseq": dst_wordseq},
+                fetch_list=fetch_vars)
+
+            nwords = src_wordseq.lod()[0][-1]
+
+            cost = np.array(avg_cost) * nwords
+            accum_cost += cost
+            accum_words += nwords
+
+        ppl = math.exp(accum_cost / accum_words)
+        t1 = time.time()
+        print("model:%s ppl:%.3f time_cost(s):%.2f" %
+              (model_path, ppl, t1 - t0))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        # fill the %s placeholder with the script name
+        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
+              sys.argv[0])
+        exit(0)
+
+    model_dir = sys.argv[1]
+    try:
+        start_index = int(sys.argv[2])
+        last_index = int(sys.argv[3])
+    except ValueError:
+        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
+              sys.argv[0])
+        exit(-1)
+
+    vocab, train_reader, test_reader = utils.prepare_data(
+        batch_size=20, buffer_size=1000, word_freq_threshold=0)
+
+    for epoch in xrange(start_index, last_index + 1):
+        epoch_path = model_dir + "/epoch_" + str(epoch)
+        infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
diff --git a/language_model/latest_kpis/imikolov_20_avg_ppl_factor.txt b/language_model/latest_kpis/imikolov_20_avg_ppl_factor.txt
new file mode 100644
index 00000000..b570c294
---
/dev/null +++ b/language_model/latest_kpis/imikolov_20_avg_ppl_factor.txt @@ -0,0 +1 @@ +[32.465272032979705] diff --git a/language_model/latest_kpis/imikolov_20_pass_duration_factor.txt b/language_model/latest_kpis/imikolov_20_pass_duration_factor.txt new file mode 100644 index 00000000..c9ab10b4 --- /dev/null +++ b/language_model/latest_kpis/imikolov_20_pass_duration_factor.txt @@ -0,0 +1 @@ +[29.741339857578278] diff --git a/language_model/run.xsh b/language_model/run.xsh new file mode 100755 index 00000000..5a40853d --- /dev/null +++ b/language_model/run.xsh @@ -0,0 +1,8 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +cudaid=${language_model:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py diff --git a/language_model/train.py b/language_model/train.py new file mode 100644 index 00000000..773c7431 --- /dev/null +++ b/language_model/train.py @@ -0,0 +1,171 @@ +import sys +import time + +import numpy as np +import math + +import paddle.fluid as fluid +import paddle + +import utils + +from continuous_evaluation import imikolov_20_avg_ppl_kpi, imikolov_20_pass_duration_kpi + + +def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound): + """ network definition """ + emb_lr_x = 10.0 + gru_lr_x = 1.0 + fc_lr_x = 1.0 + emb = fluid.layers.embedding( + input=src, + size=[vocab_size, hid_size], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=emb_lr_x), + is_sparse=True) + + fc0 = fluid.layers.fc(input=emb, + size=hid_size * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=gru_lr_x)) + gru_h0 = fluid.layers.dynamic_gru( + input=fc0, + size=hid_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=gru_lr_x)) + + fc = fluid.layers.fc(input=gru_h0, + size=vocab_size, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=fc_lr_x)) + + cost = fluid.layers.cross_entropy(input=fc, label=dst) + return cost + + +def train(train_reader, + vocab, + network, + hid_size, + base_lr, + batch_size, + pass_num, + use_cuda, + parallel, + model_dir, + init_low_bound=-0.04, + init_high_bound=0.04): + """ train network """ + vocab_size = len(vocab) + + src_wordseq = fluid.layers.data( + name="src_wordseq", shape=[1], dtype="int64", lod_level=1) + dst_wordseq = fluid.layers.data( + name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + + avg_cost = None + if not parallel: + cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size, + init_low_bound, init_high_bound) + avg_cost = fluid.layers.mean(x=cost) + else: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + cost = network( + pd.read_input(src_wordseq), + pd.read_input(dst_wordseq), vocab_size, hid_size, + init_low_bound, init_high_bound) + pd.write_output(cost) + + cost = pd() + avg_cost = fluid.layers.mean(x=cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=base_lr, + decay_steps=2100 * 4, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + total_time 
= 0.0
+    for pass_idx in xrange(pass_num):
+        epoch_idx = pass_idx + 1
+        print "epoch_%d start" % epoch_idx
+
+        t0 = time.time()
+        i = 0
+        newest_ppl = 0
+        for data in train_reader():
+            i += 1
+            lod_src_wordseq = utils.to_lodtensor(
+                map(lambda x: x[0], data), place)
+            lod_dst_wordseq = utils.to_lodtensor(
+                map(lambda x: x[1], data), place)
+            ret_avg_cost = exe.run(fluid.default_main_program(),
+                                   feed={
+                                       "src_wordseq": lod_src_wordseq,
+                                       "dst_wordseq": lod_dst_wordseq
+                                   },
+                                   fetch_list=[avg_cost],
+                                   use_program_cache=True)
+            avg_ppl = math.exp(ret_avg_cost[0])
+            newest_ppl = avg_ppl
+            if i % 100 == 0:
+                print "step:%d ppl:%.3f" % (i, avg_ppl)
+
+        t1 = time.time()
+        total_time += t1 - t0
+        print "epoch:%d num_steps:%d time_cost(s):%f" % (
+            epoch_idx, i, total_time / epoch_idx)
+
+        if pass_idx == pass_num - 1:
+            imikolov_20_pass_duration_kpi.add_record(total_time / epoch_idx)
+            imikolov_20_avg_ppl_kpi.add_record(newest_ppl)
+        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
+        feed_var_names = ["src_wordseq", "dst_wordseq"]
+        fetch_vars = [avg_cost]
+        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars,
+                                      exe)
+        print("model saved in %s" % save_dir)
+    imikolov_20_pass_duration_kpi.persist()
+    imikolov_20_avg_ppl_kpi.persist()
+    print("finish training")
+
+
+def train_net():
+    """ do training """
+    batch_size = 20
+    vocab, train_reader, test_reader = utils.prepare_data(
+        batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
+    train(
+        train_reader=train_reader,
+        vocab=vocab,
+        network=network,
+        hid_size=200,
+        base_lr=1.0,
+        batch_size=batch_size,
+        pass_num=12,
+        use_cuda=True,
+        parallel=True,
+        model_dir="model",
+        init_low_bound=-0.1,
+        init_high_bound=0.1)
+
+
+if __name__ == "__main__":
+    train_net()
diff --git a/language_model/utils.py b/language_model/utils.py
new file mode 100644
index 00000000..9ca0ef4d
--- /dev/null
+++ b/language_model/utils.py
@@ -0,0 +1,41 @@
+import sys
+import time
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle
+
+
+def to_lodtensor(data, place):
+    """ convert to LODtensor """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
+    """ prepare the English Penn Treebank (PTB) data """
+    vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imikolov.train(
+                vocab,
+                buffer_size,
+                data_type=paddle.dataset.imikolov.DataType.SEQ),
+            buf_size=buffer_size),
+        batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.imikolov.test(
+            vocab, buffer_size,
+            data_type=paddle.dataset.imikolov.DataType.SEQ),
+        batch_size)
+    return vocab, train_reader, test_reader
diff --git a/lstm/continuous_evaluation.py b/lstm/continuous_evaluation.py
new file mode 100644
index 00000000..e7efb66a
--- /dev/null
+++ b/lstm/continuous_evaluation.py
@@ -0,0 +1,17 @@
+"""
+continuous_evaluation.py
+"""
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import AccKpi
+from kpi import CostKpi
+from kpi import DurationKpi
+
+imdb_32_train_speed_kpi = AccKpi('imdb_32_train_speed', 0.03, 0, actived=True)
+imdb_32_gpu_memory_kpi = DurationKpi('imdb_32_gpu_memory', 0.05, 0,
+                                     actived=True)
+
+tracking_kpis = [
+    imdb_32_train_speed_kpi,
+    imdb_32_gpu_memory_kpi,
+]
diff --git a/lstm/get_gpu_data.py b/lstm/get_gpu_data.py
new file mode 100644
index 00000000..7afb3fdb
--- /dev/null
+++ b/lstm/get_gpu_data.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+########################################################################
+#
+# Copyright (c) 2018 Baidu.com, Inc. All Rights Reserved
+#
+########################################################################
+"""
+File: get_gpu_data.py
+Author: paddle(paddle@baidu.com)
+Date: 2018/04/02 15:57:14
+"""
+import argparse
+from continuous_evaluation import tracking_kpis
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='imdb',
+    help='Optional dataset for benchmark.')
+args = parser.parse_args()
+
+
+def save_gpu_data():
+    mem_list = []
+    with open('memory.txt', 'r') as f:
+        for i, data in enumerate(f.readlines()):
+            if i == 0:
+                continue
+            mem_list.append(int(data.split("\n")[0].split(" ")[0]))
+    # initialize the name actually used below (was misspelled gpu_memory_factor)
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size):
+            gpu_memory_kpi = kpi
+            gpu_memory_kpi.add_record(max(mem_list))
+            gpu_memory_kpi.persist()
+
+
+if __name__ == "__main__":
+    save_gpu_data()
diff --git a/lstm/latest_kpis/imdb_32_gpu_memory_factor.txt b/lstm/latest_kpis/imdb_32_gpu_memory_factor.txt
new file mode 100644
index 00000000..7a9fb042
--- /dev/null
+++ b/lstm/latest_kpis/imdb_32_gpu_memory_factor.txt
@@ -0,0 +1 @@
+[1560]
diff --git a/lstm/latest_kpis/imdb_32_train_speed_factor.txt b/lstm/latest_kpis/imdb_32_train_speed_factor.txt
new file mode 100644
index 00000000..1f3013cb
--- /dev/null
+++ b/lstm/latest_kpis/imdb_32_train_speed_factor.txt
@@ -0,0 +1 @@
+[779.2451171875]
diff --git a/lstm/model.py b/lstm/model.py
new file mode 100644
index 00000000..418a79b6
--- /dev/null
+++ b/lstm/model.py
@@ -0,0 +1,283 @@
+"""
+stacked_dynamic_lstm model for fluid
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import commands
+import subprocess
+import threading
+import time
+import numpy as np
+
+import numpy
+import paddle
+import paddle.dataset.imdb as imdb
+import paddle.fluid as fluid
+import paddle.batch as batch
+import paddle.fluid.profiler as profiler
+
+from continuous_evaluation import tracking_kpis
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial minibatches to skip, for a fairer performance test'
+    )
+    parser.add_argument(
+        '--iterations',
+        type=int,
+        default=80,
+        help='The number of minibatches.')
+    parser.add_argument(
+        '--emb_dim',
+        type=int,
+        default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=100,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='CPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpu_id',
+        type=int,
+        default=3,
+        help='The GPU Card Id. (default: %(default)d)')
+    parser.add_argument(
+        '--crop_size',
+        type=int,
+        default=int(os.environ.get('CROP_SIZE', '1500')),
+        help='The max sentence length of input. Since this model uses a plain RNN,'
+        ' gradients could explode if the sentence is too long.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    args = parser.parse_args()
+    return args
+
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+    unk_value = word_dict['<unk>']
+
+    def __impl__():
+        for item in reader():
+            if len([x for x in item[0] if x != unk_value]) < crop_size:
+                yield item
+
+    return __impl__
+
+
+def main():
+    args = parse_args()
+    lstm_size = args.hidden_dim
+
+    data = fluid.layers.data(
+        name="words", shape=[1], lod_level=1, dtype='int64')
+    sentence = fluid.layers.embedding(
+        input=data, size=[len(word_dict), args.emb_dim])
+
+    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(
+                ipt,
+                hidden,
+                size, ):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            gate = fluid.layers.sums(input=[gate0, gate1])
+            return gate
+
+        forget_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        input_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        output_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        cell_gate = fluid.layers.tanh(
+            x=gate_common(word, prev_hidden, lstm_size))
+
+        cell = fluid.layers.sums(input=[
+            fluid.layers.elementwise_mul(
+                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+                    x=input_gate, y=cell_gate)
+        ])
+
+        hidden = fluid.layers.elementwise_mul(
+            x=output_gate, y=fluid.layers.tanh(x=cell))
+
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
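+
+    # For reference, the rnn.block() above is a hand-rolled LSTM step; an
+    # eager NumPy sketch of the same update (W*/U* stand for the per-gate fc
+    # weights that gate_common creates; illustrative only):
+    #
+    #     f = sigmoid(x.dot(Wf) + h.dot(Uf))   # forget gate
+    #     i = sigmoid(x.dot(Wi) + h.dot(Ui))   # input gate
+    #     o = sigmoid(x.dot(Wo) + h.dot(Uo))   # output gate
+    #     g = np.tanh(x.dot(Wg) + h.dot(Ug))   # cell candidate
+    #     c_new = f * c + i * g                # rnn.update_memory(prev_cell, cell)
+    #     h_new = o * np.tanh(c_new)           # rnn.update_memory(prev_hidden, hidden)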
+
+    last = fluid.layers.sequence_pool(rnn(), 'last')
+    logit = fluid.layers.fc(input=last, size=2, act='softmax')
+    loss = fluid.layers.cross_entropy(
+        input=logit,
+        label=fluid.layers.data(
+            name='label', shape=[1], dtype='int64'))
+    loss = fluid.layers.mean(x=loss)
+
+    # add acc
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+        shape=[1], dtype='int64'), total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    adam = fluid.optimizer.Adam()
+    adam.minimize(loss)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.train(word_dict), args.crop_size),
+            buf_size=25000),
+        batch_size=args.batch_size)
+
+    train_acc_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == 'imdb_%s_train_acc' % (args.batch_size):
+            train_acc_kpi = kpi
+    train_speed_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == 'imdb_%s_train_speed' % (args.batch_size):
+            train_speed_kpi = kpi
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            tensor_words = to_lodtensor([x[0] for x in data], place)
+            label = numpy.array([x[1] for x in data]).astype("int64")
+            label = label.reshape((-1, 1))
+            loss_np, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": label},
+                fetch_list=[loss, batch_acc, batch_size_tensor])
+            iters += 1
+            for x in data:
+                num_samples += len(x[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss_np, acc)
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32'))
+        break
+    train_speed_kpi.persist()
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def print_arguments(args):
+    print('----------- lstm Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def collect_gpu_memory_data(alive):
+    """
+    collect the GPU memory data
+    """
+    global is_alive
+    status, output = commands.getstatusoutput('rm -rf memory.txt')
+    if status == 0:
+        print('del memory.txt')
+    command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id
+    p = subprocess.Popen(command, shell=True)
+    if p.pid < 0:
+        print('Get GPU memory data error')
+    while (is_alive):
+        time.sleep(1)
+    p.kill()
+
+
+def save_gpu_data(mem_list):
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == 'imdb_%s_gpu_memory' % (args.batch_size):
+            gpu_memory_kpi = kpi
+    gpu_memory_kpi.add_record(max(mem_list))
+    gpu_memory_kpi.persist()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    global is_alive
+    is_alive = True
+    collect_memory_thread = threading.Thread(
+        target=collect_gpu_memory_data, args=(is_alive, ))
+    collect_memory_thread.setDaemon(True)
+    collect_memory_thread.start()
+    main()
+    is_alive = False
diff --git a/lstm/run.xsh b/lstm/run.xsh
new file mode 100755
index 00000000..d184f534
--- /dev/null
+++ b/lstm/run.xsh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${lstm_cudaid:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+#imdb 32
+FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=32 --iterations=50 --gpu_id=$cudaid
+python get_gpu_data.py --batch_size=32 --data_set=imdb
+for pid in $(ps -ef | grep nvidia-smi |
grep -v grep | cut -c 9-15); do + echo $pid + kill -9 $pid +done diff --git a/mnist/continuous_evaluation.py b/mnist/continuous_evaluation.py index 0698887b..25047026 100644 --- a/mnist/continuous_evaluation.py +++ b/mnist/continuous_evaluation.py @@ -3,12 +3,14 @@ sys.path.append(os.environ['ceroot']) from kpi import CostKpi, DurationKpi, AccKpi -train_acc_kpi = AccKpi('train_acc', 0.05) -test_acc_kpi = AccKpi('test_acc', 0.05) -train_duration_kpi = DurationKpi('train_duration', 0.1) +train_cost_kpi = CostKpi('train_cost', 0.02, actived=True) +test_acc_kpi = AccKpi('test_acc', 0.005, actived=True) +train_duration_kpi = DurationKpi('train_duration', 0.02, actived=True) +train_acc_kpi = AccKpi('train_acc', 0.005, actived=True) tracking_kpis = [ train_acc_kpi, + train_cost_kpi, test_acc_kpi, train_duration_kpi, ] diff --git a/mnist/latest_kpis/test_acc_factor.txt b/mnist/latest_kpis/test_acc_factor.txt index f1fd6659..bdfcae70 100644 --- a/mnist/latest_kpis/test_acc_factor.txt +++ b/mnist/latest_kpis/test_acc_factor.txt @@ -1,5 +1,5 @@ -[0.9749000072479248] -[0.9811000227928162] -[0.9858999848365784] -[0.9860000014305115] -[0.9872000217437744] \ No newline at end of file +[0.9768999814987183] +[0.9839000105857849] +[0.9868000149726868] +[0.9866999983787537] +[0.9879000186920166] diff --git a/mnist/latest_kpis/train_acc_factor.txt b/mnist/latest_kpis/train_acc_factor.txt index 55a945e1..856ba0ff 100644 --- a/mnist/latest_kpis/train_acc_factor.txt +++ b/mnist/latest_kpis/train_acc_factor.txt @@ -1,5 +1,5 @@ -[0.9435666799545288] -[0.982283353805542] -[0.9876833558082581] -[0.9906833171844482] -[0.9932000041007996] \ No newline at end of file +[0.9471499919891357] +[0.9831333160400391] +[0.9886166453361511] +[0.9915000200271606] +[0.9929666519165039] diff --git a/mnist/latest_kpis/train_cost_factor.txt b/mnist/latest_kpis/train_cost_factor.txt new file mode 100644 index 00000000..7d7bd861 --- /dev/null +++ b/mnist/latest_kpis/train_cost_factor.txt @@ -0,0 +1,5 @@ +[0.05625442788004875] +[0.0373283299320031] +[0.0393865630030632] +[0.029800457879900932] +[0.02382788062095642] diff --git a/mnist/latest_kpis/train_duration_factor.txt b/mnist/latest_kpis/train_duration_factor.txt index 8253e948..ee89488a 100644 --- a/mnist/latest_kpis/train_duration_factor.txt +++ b/mnist/latest_kpis/train_duration_factor.txt @@ -1,5 +1,5 @@ -[38.24392104148865] -[36.998713970184326] -[36.87090182304382] -[36.75976610183716] -[36.79504203796387] \ No newline at end of file +[36.52754783630371] +[36.04332995414734] +[36.20732808113098] +[36.188393115997314] +[35.95417380332947] diff --git a/mnist/model.py b/mnist/model.py index f180f3b5..1ae83922 100644 --- a/mnist/model.py +++ b/mnist/model.py @@ -10,13 +10,13 @@ import paddle.fluid as fluid import paddle.fluid.profiler as profiler -from continuous_evaluation import (train_acc_kpi, test_acc_kpi, +from continuous_evaluation import (train_acc_kpi, train_cost_kpi, test_acc_kpi, train_duration_kpi, tracking_kpis) -SEED = 1 +SEED = 90 DTYPE = "float32" # random seed must set before configuring the network. 
-# fluid.default_startup_program().random_seed = SEED +fluid.default_startup_program().random_seed = SEED def parse_args(): @@ -49,8 +49,8 @@ def parse_args(): def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] - and vars(args)['device'] == 'GPU') + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') print('----------- Configuration Arguments -----------') for arg, value in sorted(vars(args).iteritems()): print('%s: %s' % (arg, value)) @@ -99,13 +99,10 @@ def eval_test(exe, batch_acc, batch_size_tensor, inference_program): y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([len(y_data), 1]) - acc, weight = exe.run( - inference_program, - feed={ - "pixel": img_data, - "label": y_data - }, - fetch_list=[batch_acc, batch_size_tensor]) + acc, weight = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) test_pass_acc.add(value=acc, weight=weight) pass_acc = test_pass_acc.eval() return pass_acc @@ -158,6 +155,7 @@ def run_benchmark(model, args): for pass_id in range(args.pass_num): accuracy.reset() pass_start = time.time() + every_pass_loss = [] for batch_id, data in enumerate(train_reader()): img_data = np.array( map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) @@ -165,29 +163,30 @@ def run_benchmark(model, args): y_data = y_data.reshape([len(y_data), 1]) start = time.time() - outs = exe.run( + loss, acc, weight = exe.run( fluid.default_main_program(), - feed={ - "pixel": img_data, - "label": y_data - }, + feed={"pixel": img_data, + "label": y_data}, fetch_list=[avg_cost, batch_acc, batch_size_tensor] ) # The accuracy is the accumulation of batches, but not the current batch. 
-            accuracy.add(value=outs[1], weight=outs[2])
             end = time.time()
-            loss = np.array(outs[0])
-            acc = np.array(outs[1])
+            accuracy.add(value=acc, weight=weight)
+            every_pass_loss.append(loss)
+            print ("Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                   (pass_id, batch_id, loss, acc))
 
         pass_end = time.time()
 
         train_avg_acc = accuracy.eval()
+        train_avg_loss = np.mean(every_pass_loss)
         test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                  inference_program)
 
-        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
-              (pass_id, train_avg_acc, test_avg_acc, (pass_end - pass_start)))
+        print("pass=%d, train_avg_acc=%f,train_avg_loss=%f, test_avg_acc=%f, elapse=%f" %
+              (pass_id, train_avg_acc, train_avg_loss, test_avg_acc, (pass_end - pass_start)))
 
         train_acc_kpi.add_record(np.array(train_avg_acc, dtype='float32'))
+        train_cost_kpi.add_record(np.array(train_avg_loss, dtype='float32'))
         test_acc_kpi.add_record(np.array(test_avg_acc, dtype='float32'))
         train_duration_kpi.add_record(pass_end - pass_start)
diff --git a/mnist/run.xsh b/mnist/run.xsh
index 65f75488..504d3c63 100755
--- a/mnist/run.xsh
+++ b/mnist/run.xsh
@@ -2,5 +2,4 @@
 import sys
 
 model_file = 'model.py'
-
 python @(model_file) --batch_size 128 --pass_num 5 --device CPU
diff --git a/object_detection/continuous_evaluation.py b/object_detection/continuous_evaluation.py
new file mode 100644
index 00000000..465a7985
--- /dev/null
+++ b/object_detection/continuous_evaluation.py
@@ -0,0 +1,10 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+train_speed_kpi = AccKpi('train_speed', 0.02, 0, actived=True)
+four_card_speed_kpi = AccKpi('four_card_train_speed', 0.02, 0, actived=True)
+
+tracking_kpis = [train_cost_kpi, train_speed_kpi, four_card_speed_kpi]
diff --git a/object_detection/download.sh b/object_detection/download.sh
new file mode 100755
index 00000000..fe483255
--- /dev/null
+++ b/object_detection/download.sh
@@ -0,0 +1,19 @@
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+# Download the data.
+cp labels/* data/pascalvoc/
+cd data/pascalvoc
+
+echo "Downloading..."
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+# Extract the data.
+echo "Extracting..."
+tar -xf VOCtrainval_11-May-2012.tar
+tar -xf VOCtrainval_06-Nov-2007.tar
+tar -xf VOCtest_06-Nov-2007.tar
+
+echo "Creating data lists..."
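+# A small guard (sketch): create_list.py below expects ./VOCdevkit from the
+# tarballs extracted above, so fail early if the downloads did not complete.
+[ -d VOCdevkit ] || { echo "VOCdevkit not found; re-run the downloads" >&2; exit 1; }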
+python create_list.py diff --git a/object_detection/image_util.py b/object_detection/image_util.py new file mode 100644 index 00000000..4ce53048 --- /dev/null +++ b/object_detection/image_util.py @@ -0,0 +1,234 @@ +from PIL import Image, ImageEnhance +import numpy as np +import random +import math + + +class sampler(): + def __init__(self, max_sample, max_trial, min_scale, max_scale, + min_aspect_ratio, max_aspect_ratio, min_jaccard_overlap, + max_jaccard_overlap): + self.max_sample = max_sample + self.max_trial = max_trial + self.min_scale = min_scale + self.max_scale = max_scale + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + self.min_jaccard_overlap = min_jaccard_overlap + self.max_jaccard_overlap = max_jaccard_overlap + + +class bbox(): + def __init__(self, xmin, ymin, xmax, ymax): + self.xmin = xmin + self.ymin = ymin + self.xmax = xmax + self.ymax = ymax + + +def bbox_area(src_bbox): + width = src_bbox.xmax - src_bbox.xmin + height = src_bbox.ymax - src_bbox.ymin + return width * height + + +def generate_sample(sampler): + scale = random.uniform(sampler.min_scale, sampler.max_scale) + min_aspect_ratio = max(sampler.min_aspect_ratio, (scale**2.0)) + max_aspect_ratio = min(sampler.max_aspect_ratio, 1 / (scale**2.0)) + aspect_ratio = random.uniform(min_aspect_ratio, max_aspect_ratio) + bbox_width = scale * (aspect_ratio**0.5) + bbox_height = scale / (aspect_ratio**0.5) + xmin_bound = 1 - bbox_width + ymin_bound = 1 - bbox_height + xmin = random.uniform(0, xmin_bound) + ymin = random.uniform(0, ymin_bound) + xmax = xmin + bbox_width + ymax = ymin + bbox_height + sampled_bbox = bbox(xmin, ymin, xmax, ymax) + return sampled_bbox + + +def jaccard_overlap(sample_bbox, object_bbox): + if sample_bbox.xmin >= object_bbox.xmax or \ + sample_bbox.xmax <= object_bbox.xmin or \ + sample_bbox.ymin >= object_bbox.ymax or \ + sample_bbox.ymax <= object_bbox.ymin: + return 0 + intersect_xmin = max(sample_bbox.xmin, object_bbox.xmin) + intersect_ymin = max(sample_bbox.ymin, object_bbox.ymin) + intersect_xmax = min(sample_bbox.xmax, object_bbox.xmax) + intersect_ymax = min(sample_bbox.ymax, object_bbox.ymax) + intersect_size = (intersect_xmax - intersect_xmin) * ( + intersect_ymax - intersect_ymin) + sample_bbox_size = bbox_area(sample_bbox) + object_bbox_size = bbox_area(object_bbox) + overlap = intersect_size / ( + sample_bbox_size + object_bbox_size - intersect_size) + return overlap + + +def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): + if sampler.min_jaccard_overlap == 0 and sampler.max_jaccard_overlap == 0: + return True + for i in range(len(bbox_labels)): + object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2], + bbox_labels[i][3], bbox_labels[i][4]) + overlap = jaccard_overlap(sample_bbox, object_bbox) + if sampler.min_jaccard_overlap != 0 and \ + overlap < sampler.min_jaccard_overlap: + continue + if sampler.max_jaccard_overlap != 0 and \ + overlap > sampler.max_jaccard_overlap: + continue + return True + return False + + +def generate_batch_samples(batch_sampler, bbox_labels): + sampled_bbox = [] + index = [] + c = 0 + for sampler in batch_sampler: + found = 0 + for i in range(sampler.max_trial): + if found >= sampler.max_sample: + break + sample_bbox = generate_sample(sampler) + if satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): + sampled_bbox.append(sample_bbox) + found = found + 1 + index.append(c) + c = c + 1 + return sampled_bbox + + +def clip_bbox(src_bbox): + src_bbox.xmin = max(min(src_bbox.xmin, 1.0), 0.0) + 
src_bbox.ymin = max(min(src_bbox.ymin, 1.0), 0.0) + src_bbox.xmax = max(min(src_bbox.xmax, 1.0), 0.0) + src_bbox.ymax = max(min(src_bbox.ymax, 1.0), 0.0) + return src_bbox + + +def meet_emit_constraint(src_bbox, sample_bbox): + center_x = (src_bbox.xmax + src_bbox.xmin) / 2 + center_y = (src_bbox.ymax + src_bbox.ymin) / 2 + if center_x >= sample_bbox.xmin and \ + center_x <= sample_bbox.xmax and \ + center_y >= sample_bbox.ymin and \ + center_y <= sample_bbox.ymax: + return True + return False + + +def transform_labels(bbox_labels, sample_bbox): + proj_bbox = bbox(0, 0, 0, 0) + sample_labels = [] + for i in range(len(bbox_labels)): + sample_label = [] + object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2], + bbox_labels[i][3], bbox_labels[i][4]) + if not meet_emit_constraint(object_bbox, sample_bbox): + continue + sample_width = sample_bbox.xmax - sample_bbox.xmin + sample_height = sample_bbox.ymax - sample_bbox.ymin + proj_bbox.xmin = (object_bbox.xmin - sample_bbox.xmin) / sample_width + proj_bbox.ymin = (object_bbox.ymin - sample_bbox.ymin) / sample_height + proj_bbox.xmax = (object_bbox.xmax - sample_bbox.xmin) / sample_width + proj_bbox.ymax = (object_bbox.ymax - sample_bbox.ymin) / sample_height + proj_bbox = clip_bbox(proj_bbox) + if bbox_area(proj_bbox) > 0: + sample_label.append(bbox_labels[i][0]) + sample_label.append(float(proj_bbox.xmin)) + sample_label.append(float(proj_bbox.ymin)) + sample_label.append(float(proj_bbox.xmax)) + sample_label.append(float(proj_bbox.ymax)) + sample_label.append(bbox_labels[i][5]) + sample_labels.append(sample_label) + return sample_labels + + +def crop_image(img, bbox_labels, sample_bbox, image_width, image_height): + sample_bbox = clip_bbox(sample_bbox) + xmin = int(sample_bbox.xmin * image_width) + xmax = int(sample_bbox.xmax * image_width) + ymin = int(sample_bbox.ymin * image_height) + ymax = int(sample_bbox.ymax * image_height) + sample_img = img[ymin:ymax, xmin:xmax] + sample_labels = transform_labels(bbox_labels, sample_bbox) + return sample_img, sample_labels + + +def random_brightness(img, settings): + prob = random.uniform(0, 1) + if prob < settings._brightness_prob: + delta = random.uniform(-settings._brightness_delta, + settings._brightness_delta) + 1 + img = ImageEnhance.Brightness(img).enhance(delta) + return img + + +def random_contrast(img, settings): + prob = random.uniform(0, 1) + if prob < settings._contrast_prob: + delta = random.uniform(-settings._contrast_delta, + settings._contrast_delta) + 1 + img = ImageEnhance.Contrast(img).enhance(delta) + return img + + +def random_saturation(img, settings): + prob = random.uniform(0, 1) + if prob < settings._saturation_prob: + delta = random.uniform(-settings._saturation_delta, + settings._saturation_delta) + 1 + img = ImageEnhance.Color(img).enhance(delta) + return img + + +def random_hue(img, settings): + prob = random.uniform(0, 1) + if prob < settings._hue_prob: + delta = random.uniform(-settings._hue_delta, settings._hue_delta) + img_hsv = np.array(img.convert('HSV')) + img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta + img = Image.fromarray(img_hsv, mode='HSV').convert('RGB') + return img + + +def distort_image(img, settings): + prob = random.uniform(0, 1) + # Apply different distort order + if prob > 0.5: + img = random_brightness(img, settings) + img = random_contrast(img, settings) + img = random_saturation(img, settings) + img = random_hue(img, settings) + else: + img = random_brightness(img, settings) + img = random_saturation(img, settings) + img = random_hue(img, settings) 
+ img = random_contrast(img, settings) + return img + + +def expand_image(img, bbox_labels, img_width, img_height, settings): + prob = random.uniform(0, 1) + if prob < settings._expand_prob: + if settings._expand_max_ratio - 1 >= 0.01: + expand_ratio = random.uniform(1, settings._expand_max_ratio) + height = int(img_height * expand_ratio) + width = int(img_width * expand_ratio) + h_off = math.floor(random.uniform(0, height - img_height)) + w_off = math.floor(random.uniform(0, width - img_width)) + expand_bbox = bbox(-w_off / img_width, -h_off / img_height, + (width - w_off) / img_width, + (height - h_off) / img_height) + expand_img = np.ones((height, width, 3)) + expand_img = np.uint8(expand_img * np.squeeze(settings._img_mean)) + expand_img = Image.fromarray(expand_img) + expand_img.paste(img, (int(w_off), int(h_off))) + bbox_labels = transform_labels(bbox_labels, expand_bbox) + return expand_img, bbox_labels, width, height + return img, bbox_labels, img_width, img_height diff --git a/object_detection/labels/create_list.py b/object_detection/labels/create_list.py new file mode 100644 index 00000000..8b472aac --- /dev/null +++ b/object_detection/labels/create_list.py @@ -0,0 +1,66 @@ +import os +import os.path as osp +import re +import random + +devkit_dir = './VOCdevkit' +years = ['2007', '2012'] + + +def get_dir(devkit_dir, year, type): + return osp.join(devkit_dir, 'VOC' + year, type) + + +def walk_dir(devkit_dir, year): + filelist_dir = get_dir(devkit_dir, year, 'ImageSets/Main') + annotation_dir = get_dir(devkit_dir, year, 'Annotations') + img_dir = get_dir(devkit_dir, year, 'JPEGImages') + trainval_list = [] + test_list = [] + added = set() + + for _, _, files in os.walk(filelist_dir): + for fname in files: + img_ann_list = [] + if re.match('[a-z]+_trainval\.txt', fname): + img_ann_list = trainval_list + elif re.match('[a-z]+_test\.txt', fname): + img_ann_list = test_list + else: + continue + fpath = osp.join(filelist_dir, fname) + for line in open(fpath): + name_prefix = line.strip().split()[0] + if name_prefix in added: + continue + added.add(name_prefix) + ann_path = osp.join(annotation_dir, name_prefix + '.xml') + img_path = osp.join(img_dir, name_prefix + '.jpg') + assert os.path.isfile( + ann_path), 'file %s not found.' % ann_path + assert os.path.isfile( + img_path), 'file %s not found.' 
% img_path + img_ann_list.append((img_path, ann_path)) + + return trainval_list, test_list + + +def prepare_filelist(devkit_dir, years, output_dir): + trainval_list = [] + test_list = [] + for year in years: + trainval, test = walk_dir(devkit_dir, year) + trainval_list.extend(trainval) + test_list.extend(test) + random.shuffle(trainval_list) + with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval: + for item in trainval_list: + ftrainval.write(item[0] + ' ' + item[1] + '\n') + + with open(osp.join(output_dir, 'test.txt'), 'w') as ftest: + for item in test_list: + ftest.write(item[0] + ' ' + item[1] + '\n') + + +if __name__ == '__main__': + prepare_filelist(devkit_dir, years, '.') diff --git a/object_detection/labels/label_list b/object_detection/labels/label_list new file mode 100644 index 00000000..87df23ce --- /dev/null +++ b/object_detection/labels/label_list @@ -0,0 +1,21 @@ +background +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor diff --git a/object_detection/latest_kpis/four_card_train_speed_factor.txt b/object_detection/latest_kpis/four_card_train_speed_factor.txt new file mode 100644 index 00000000..8ba14137 --- /dev/null +++ b/object_detection/latest_kpis/four_card_train_speed_factor.txt @@ -0,0 +1 @@ +[143.63855412820158] \ No newline at end of file diff --git a/object_detection/latest_kpis/train_cost_factor.txt b/object_detection/latest_kpis/train_cost_factor.txt new file mode 100644 index 00000000..5b830a56 --- /dev/null +++ b/object_detection/latest_kpis/train_cost_factor.txt @@ -0,0 +1 @@ +[8.613137321472168] diff --git a/object_detection/latest_kpis/train_speed_factor.txt b/object_detection/latest_kpis/train_speed_factor.txt new file mode 100644 index 00000000..11510145 --- /dev/null +++ b/object_detection/latest_kpis/train_speed_factor.txt @@ -0,0 +1 @@ +[71.42477785941152] diff --git a/object_detection/mobilenet_ssd.py b/object_detection/mobilenet_ssd.py new file mode 100644 index 00000000..06c71d55 --- /dev/null +++ b/object_detection/mobilenet_ssd.py @@ -0,0 +1,116 @@ +import paddle as paddle +import paddle.fluid as fluid +from paddle.fluid.initializer import MSRA +from paddle.fluid.param_attr import ParamAttr + + +def conv_bn(input, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='relu', + use_cudnn=True): + parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA()) + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + act=None, + use_cudnn=use_cudnn, + param_attr=parameter_attr, + bias_attr=False) + parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA()) + bias_attr = ParamAttr(learning_rate=0.2) + return fluid.layers.batch_norm(input=conv, act=act) + + +def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride, + scale): + depthwise_conv = conv_bn( + input=input, + filter_size=3, + num_filters=int(num_filters1 * scale), + stride=stride, + padding=1, + num_groups=int(num_groups * scale), + use_cudnn=False) + + pointwise_conv = conv_bn( + input=depthwise_conv, + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + return pointwise_conv + + +def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale): + # 1x1 conv + pointwise_conv = conv_bn( + input=input, + filter_size=1, + num_filters=int(num_filters1 * scale), + stride=1, 
+ num_groups=int(num_groups * scale), + padding=0) + + # 3x3 conv + normal_conv = conv_bn( + input=pointwise_conv, + filter_size=3, + num_filters=int(num_filters2 * scale), + stride=2, + num_groups=int(num_groups * scale), + padding=1) + return normal_conv + + +def mobile_net(num_classes, img, img_shape, scale=1.0): + # 300x300 + tmp = conv_bn(img, 3, int(32 * scale), 2, 1, 3) + # 150x150 + tmp = depthwise_separable(tmp, 32, 64, 32, 1, scale) + tmp = depthwise_separable(tmp, 64, 128, 64, 2, scale) + # 75x75 + tmp = depthwise_separable(tmp, 128, 128, 128, 1, scale) + tmp = depthwise_separable(tmp, 128, 256, 128, 2, scale) + # 38x38 + tmp = depthwise_separable(tmp, 256, 256, 256, 1, scale) + tmp = depthwise_separable(tmp, 256, 512, 256, 2, scale) + + # 19x19 + for i in range(5): + tmp = depthwise_separable(tmp, 512, 512, 512, 1, scale) + module11 = tmp + tmp = depthwise_separable(tmp, 512, 1024, 512, 2, scale) + + # 10x10 + module13 = depthwise_separable(tmp, 1024, 1024, 1024, 1, scale) + module14 = extra_block(module13, 256, 512, 1, 2, scale) + # 5x5 + module15 = extra_block(module14, 128, 256, 1, 2, scale) + # 3x3 + module16 = extra_block(module15, 128, 256, 1, 2, scale) + # 2x2 + module17 = extra_block(module16, 64, 128, 1, 2, scale) + + mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head( + inputs=[module11, module13, module14, module15, module16, module17], + image=img, + num_classes=num_classes, + min_ratio=20, + max_ratio=90, + min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0], + max_sizes=[[], 150.0, 195.0, 240.0, 285.0, 300.0], + aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]], + base_size=img_shape[2], + offset=0.5, + flip=True) + + return mbox_locs, mbox_confs, box, box_var diff --git a/object_detection/reader.py b/object_detection/reader.py new file mode 100644 index 00000000..4ccf3326 --- /dev/null +++ b/object_detection/reader.py @@ -0,0 +1,355 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
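+
+# Illustrative wiring of the Settings object defined below (a sketch only;
+# the paths mirror the pascalvoc layout prepared by download.sh):
+#
+#     settings = Settings(dataset='pascalvoc', data_dir='data/pascalvoc',
+#                         label_file='label_list', resize_h=300, resize_w=300,
+#                         mean_value=[127.5, 127.5, 127.5])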
+
+import image_util
+from paddle.utils.image_util import *
+import random
+from PIL import Image
+from PIL import ImageDraw
+import numpy as np
+import xml.etree.ElementTree
+import os
+import time
+import copy
+
+
+class Settings(object):
+    def __init__(self,
+                 dataset=None,
+                 data_dir=None,
+                 label_file=None,
+                 resize_h=300,
+                 resize_w=300,
+                 mean_value=[127.5, 127.5, 127.5],
+                 apply_distort=True,
+                 apply_expand=True,
+                 toy=0):
+        self._dataset = dataset
+        self._toy = toy
+        self._data_dir = data_dir
+        if dataset == "pascalvoc":
+            self._label_list = []
+            label_fpath = os.path.join(data_dir, label_file)
+            for line in open(label_fpath):
+                self._label_list.append(line.strip())
+
+        self._apply_distort = apply_distort
+        self._apply_expand = apply_expand
+        self._resize_height = resize_h
+        self._resize_width = resize_w
+        self._img_mean = np.array(mean_value)[:, np.newaxis,
+                                              np.newaxis].astype('float32')
+        self._expand_prob = 0.5
+        self._expand_max_ratio = 4
+        self._hue_prob = 0.5
+        self._hue_delta = 18
+        self._contrast_prob = 0.5
+        self._contrast_delta = 0.5
+        self._saturation_prob = 0.5
+        self._saturation_delta = 0.5
+        self._brightness_prob = 0.5
+        self._brightness_delta = 0.125
+
+    @property
+    def dataset(self):
+        return self._dataset
+
+    @property
+    def toy(self):
+        return self._toy
+
+    @property
+    def apply_expand(self):
+        return self._apply_expand
+
+    @property
+    def apply_distort(self):
+        return self._apply_distort
+
+    @property
+    def data_dir(self):
+        return self._data_dir
+
+    @data_dir.setter
+    def data_dir(self, data_dir):
+        self._data_dir = data_dir
+
+    @property
+    def label_list(self):
+        return self._label_list
+
+    @property
+    def resize_h(self):
+        return self._resize_height
+
+    @property
+    def resize_w(self):
+        return self._resize_width
+
+    @property
+    def img_mean(self):
+        return self._img_mean
+
+
+def preprocess(img, bbox_labels, mode, settings):
+    img_width, img_height = img.size
+    sampled_labels = bbox_labels
+    if mode == 'train':
+        if settings._apply_distort:
+            img = image_util.distort_image(img, settings)
+        if settings._apply_expand:
+            img, bbox_labels, img_width, img_height = image_util.expand_image(
+                img, bbox_labels, img_width, img_height, settings)
+        # sampling
+        batch_sampler = []
+        # hard-code here
+        batch_sampler.append(
+            image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0))
+        sampled_bbox = image_util.generate_batch_samples(batch_sampler,
+                                                         bbox_labels)
+
+        img = np.array(img)
+        if len(sampled_bbox) > 0:
+            idx = int(random.uniform(0, len(sampled_bbox)))
+            img, sampled_labels = image_util.crop_image(
+                img, bbox_labels, sampled_bbox[idx], img_width, img_height)
+
+        img = Image.fromarray(img)
+    img = img.resize((settings.resize_w, settings.resize_h), Image.ANTIALIAS)
+    img = np.array(img)
+
+    if mode == 'train':
+        mirror = int(random.uniform(0, 2))
+        if mirror == 1:
+            img = img[:, ::-1, :]
+            for i in xrange(len(sampled_labels)):
+                tmp = sampled_labels[i][1]
+                sampled_labels[i][1] = 1 - sampled_labels[i][3]
+                sampled_labels[i][3] = 1 - tmp
+    # HWC to CHW
+    if len(img.shape) == 
3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + # RBG to BGR + img = img[[2, 1, 0], :, :] + img = img.astype('float32') + img -= settings.img_mean + img = img * 0.007843 + return img, sampled_labels + + +def coco(settings, file_list, mode, shuffle): + # cocoapi + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + coco = COCO(file_list) + image_ids = coco.getImgIds() + images = coco.loadImgs(image_ids) + category_ids = coco.getCatIds() + category_names = [item['name'] for item in coco.loadCats(category_ids)] + + if not settings.toy == 0: + images = images[:settings.toy] if len( + images) > settings.toy else images + print("{} on {} with {} images".format(mode, settings.dataset, len( + images))) + + def reader(): + if mode == 'train' and shuffle: + random.shuffle(images) + for image in images: + image_name = image['file_name'] + image_path = os.path.join(settings.data_dir, image_name) + + im = Image.open(image_path) + if im.mode == 'L': + im = im.convert('RGB') + im_width, im_height = im.size + + # layout: category_id | xmin | ymin | xmax | ymax | iscrowd | + # origin_coco_bbox | segmentation | area | image_id | annotation_id + bbox_labels = [] + annIds = coco.getAnnIds(imgIds=image['id']) + anns = coco.loadAnns(annIds) + for ann in anns: + bbox_sample = [] + # start from 1, leave 0 to background + bbox_sample.append( + float(category_ids.index(ann['category_id'])) + 1) + bbox = ann['bbox'] + xmin, ymin, w, h = bbox + xmax = xmin + w + ymax = ymin + h + bbox_sample.append(float(xmin) / im_width) + bbox_sample.append(float(ymin) / im_height) + bbox_sample.append(float(xmax) / im_width) + bbox_sample.append(float(ymax) / im_height) + bbox_sample.append(float(ann['iscrowd'])) + bbox_labels.append(bbox_sample) + im, sample_labels = preprocess(im, bbox_labels, mode, settings) + sample_labels = np.array(sample_labels) + if len(sample_labels) == 0: continue + im = im.astype('float32') + boxes = sample_labels[:, 1:5] + lbls = sample_labels[:, 0].astype('int32') + difficults = sample_labels[:, -1].astype('int32') + yield im, boxes, lbls, difficults + + return reader + + +def pascalvoc(settings, file_list, mode, shuffle): + flist = open(file_list) + images = [line.strip() for line in flist] + if not settings.toy == 0: + images = images[:settings.toy] if len( + images) > settings.toy else images + print("{} on {} with {} images".format(mode, settings.dataset, len( + images))) + + def reader(): + if mode == 'train' and shuffle: + random.shuffle(images) + for image in images: + image_path, label_path = image.split() + image_path = os.path.join(settings.data_dir, image_path) + label_path = os.path.join(settings.data_dir, label_path) + + im = Image.open(image_path) + if im.mode == 'L': + im = im.convert('RGB') + im_width, im_height = im.size + + # layout: label | xmin | ymin | xmax | ymax | difficult + bbox_labels = [] + root = xml.etree.ElementTree.parse(label_path).getroot() + for object in root.findall('object'): + bbox_sample = [] + # start from 1 + bbox_sample.append( + float( + settings.label_list.index(object.find('name').text))) + bbox = object.find('bndbox') + difficult = float(object.find('difficult').text) + bbox_sample.append(float(bbox.find('xmin').text) / im_width) + bbox_sample.append(float(bbox.find('ymin').text) / im_height) + bbox_sample.append(float(bbox.find('xmax').text) / im_width) + bbox_sample.append(float(bbox.find('ymax').text) / im_height) + bbox_sample.append(difficult) + bbox_labels.append(bbox_sample) + im, sample_labels = 
preprocess(im, bbox_labels, mode, settings)
+            sample_labels = np.array(sample_labels)
+            if len(sample_labels) == 0: continue
+            im = im.astype('float32')
+            boxes = sample_labels[:, 1:5]
+            lbls = sample_labels[:, 0].astype('int32')
+            difficults = sample_labels[:, -1].astype('int32')
+            yield im, boxes, lbls, difficults
+
+    return reader
+
+
+def draw_bounding_box_on_image(image,
+                               sample_labels,
+                               image_name,
+                               category_names,
+                               color='red',
+                               thickness=4,
+                               with_text=True,
+                               normalized=True):
+    image = Image.fromarray(image)
+    draw = ImageDraw.Draw(image)
+    im_width, im_height = image.size
+    if not normalized:
+        im_width, im_height = 1, 1
+    for item in sample_labels:
+        label = item[0]
+        category_name = category_names[int(label)]
+        bbox = item[1:5]
+        xmin, ymin, xmax, ymax = bbox
+        (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
+                                      ymin * im_height, ymax * im_height)
+        draw.line(
+            [(left, top), (left, bottom), (right, bottom), (right, top),
+             (left, top)],
+            width=thickness,
+            fill=color)
+        if with_text:
+            if image.mode == 'RGB':
+                draw.text((left, top), category_name, (255, 255, 0))
+    image.save(image_name)
+
+
+def train(settings, file_list, shuffle=True):
+    file_list = os.path.join(settings.data_dir, file_list)
+    if settings.dataset == 'coco':
+        train_settings = copy.copy(settings)
+        if '2014' in file_list:
+            sub_dir = "train2014"
+        elif '2017' in file_list:
+            sub_dir = "train2017"
+        train_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
+        return coco(train_settings, file_list, 'train', shuffle)
+    else:
+        return pascalvoc(settings, file_list, 'train', shuffle)
+
+
+def test(settings, file_list):
+    file_list = os.path.join(settings.data_dir, file_list)
+    if settings.dataset == 'coco':
+        test_settings = copy.copy(settings)
+        if '2014' in file_list:
+            sub_dir = "val2014"
+        elif '2017' in file_list:
+            sub_dir = "val2017"
+        test_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
+        return coco(test_settings, file_list, 'test', False)
+    else:
+        return pascalvoc(settings, file_list, 'test', False)
+
+
+def infer(settings, image_path):
+    def reader():
+        im = Image.open(image_path)
+        if im.mode == 'L':
+            im = im.convert('RGB')
+        im_width, im_height = im.size
+        img = im.resize((settings.resize_w, settings.resize_h),
+                        Image.ANTIALIAS)
+        img = np.array(img)
+        # HWC to CHW
+        if len(img.shape) == 3:
+            img = np.swapaxes(img, 1, 2)
+            img = np.swapaxes(img, 1, 0)
+        # RGB to BGR
+        img = img[[2, 1, 0], :, :]
+        img = img.astype('float32')
+        img -= settings.img_mean
+        img = img * 0.007843
+        yield img
+
+    return reader diff --git a/object_detection/run.xsh b/object_detection/run.xsh new file mode 100755 index 00000000..8783d96f --- /dev/null +++ b/object_detection/run.xsh @@ -0,0 +1,16 @@ +#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${object_detection_cudaid:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+#if [ ! 
-d "data/pascalvoc" ];then
+#    mkdir -p data/pascalvoc
+#    ./download.sh
+#fi
+FLAGS_benchmark=true python train.py --batch_size=64 --num_passes=2
+cudaid=${object_detection_multi_cudaid:=0,1,2,3} # use cards 0-3 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+FLAGS_benchmark=true python train.py --batch_size=64 --num_passes=2 --gpu_card_num=4
+ diff --git a/object_detection/train.py b/object_detection/train.py new file mode 100644 index 00000000..7846cbe8 --- /dev/null +++ b/object_detection/train.py @@ -0,0 +1,369 @@ +import os
+import time
+import numpy as np
+import argparse
+import functools
+import shutil
+
+import paddle
+import paddle.fluid as fluid
+import reader
+from mobilenet_ssd import mobile_net
+from utility import add_arguments, print_arguments
+
+from continuous_evaluation import train_cost_kpi, train_speed_kpi, four_card_speed_kpi
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('learning_rate', float, 0.001, "Learning rate.")
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('num_passes', int, 120, "Epoch number.")
+add_arg('iterations', int, 120, "The number of mini-batches per pass.")
+add_arg('skip_batch_num', int, 5, "The number of warm-up mini-batches to skip.")
+add_arg('gpu_card_num', int, 1, "The number of GPU cards.")
+add_arg('parallel', bool, True, "Whether use parallel training.")
+add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
+add_arg('use_nccl', bool, True, "Whether to use NCCL or not.")
+add_arg('dataset', str, 'pascalvoc', "coco or pascalvoc.")
+add_arg('model_save_dir', str, 'model', "The path to save model.")
+add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
+add_arg('apply_distort', bool, True, "Whether to apply distort.")
+add_arg('apply_expand', bool, True, "Whether to apply expand.")
+add_arg('ap_version', str, '11point', "11point or integral")
+add_arg('resize_h', int, 300, "The resized image height.")
+add_arg('resize_w', int, 300, "The resized image width.")
+add_arg('mean_value_B', float, 127.5, "mean value for B channel which will be subtracted") #123.68
+add_arg('mean_value_G', float, 127.5, "mean value for G channel which will be subtracted") #116.78
+add_arg('mean_value_R', float, 127.5, "mean value for R channel which will be subtracted") #103.94
+add_arg('is_toy', int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample")
+# yapf: enable
+
+
+def parallel_do(args,
+                train_file_list,
+                val_file_list,
+                data_args,
+                learning_rate,
+                batch_size,
+                num_passes,
+                model_save_dir,
+                pretrained_model=None):
+    image_shape = [3, data_args.resize_h, data_args.resize_w]
+    if data_args.dataset == 'coco':
+        num_classes = 81
+    elif data_args.dataset == 'pascalvoc':
+        num_classes = 21
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    gt_box = fluid.layers.data(
+        name='gt_box', shape=[4], dtype='float32', lod_level=1)
+    gt_label = fluid.layers.data(
+        name='gt_label', shape=[1], dtype='int32', lod_level=1)
+    difficult = fluid.layers.data(
+        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
+
+    if args.parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
+        with pd.do():
+            image_ = pd.read_input(image)
+            gt_box_ = pd.read_input(gt_box)
+            gt_label_ = pd.read_input(gt_label)
+            difficult_ = pd.read_input(difficult)
+            locs, confs, box, box_var = mobile_net(num_classes, image_,
+                                                   image_shape)
+            loss = 
fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, box, + box_var) + nmsed_out = fluid.layers.detection_output( + locs, confs, box, box_var, nms_threshold=0.45) + loss = fluid.layers.reduce_sum(loss) + pd.write_output(loss) + pd.write_output(nmsed_out) + + loss, nmsed_out = pd() + loss = fluid.layers.mean(loss) + else: + locs, confs, box, box_var = mobile_net(num_classes, image, image_shape) + nmsed_out = fluid.layers.detection_output( + locs, confs, box, box_var, nms_threshold=0.45) + loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, + box_var) + loss = fluid.layers.reduce_sum(loss) + + test_program = fluid.default_main_program().clone(for_test=True) + with fluid.program_guard(test_program): + map_eval = fluid.evaluator.DetectionMAP( + nmsed_out, + gt_label, + gt_box, + difficult, + num_classes, + overlap_threshold=0.5, + evaluate_difficult=False, + ap_version=args.ap_version) + + if data_args.dataset == 'coco': + # learning rate decay in 12, 19 pass, respectively + if '2014' in train_file_list: + boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19] + elif '2017' in train_file_list: + boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19] + elif data_args.dataset == 'pascalvoc': + boundaries = [40000, 60000] + values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25] + optimizer = fluid.optimizer.RMSProp( + learning_rate=fluid.layers.piecewise_decay(boundaries, values), + regularization=fluid.regularizer.L2Decay(0.00005), ) + + optimizer.minimize(loss) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if pretrained_model: + + def if_exist(var): + return os.path.exists(os.path.join(pretrained_model, var.name)) + + fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) + + train_reader = paddle.batch( + reader.train(data_args, train_file_list), batch_size=batch_size) + test_reader = paddle.batch( + reader.test(data_args, val_file_list), batch_size=batch_size) + feeder = fluid.DataFeeder( + place=place, feed_list=[image, gt_box, gt_label, difficult]) + + def test(pass_id): + _, accum_map = map_eval.get_map_var() + map_eval.reset(exe) + test_map = None + for data in test_reader(): + test_map = exe.run(test_program, + feed=feeder.feed(data), + fetch_list=[accum_map]) + print("Test {0}, map {1}".format(pass_id, test_map[0])) + + for pass_id in range(num_passes): + start_time = time.time() + prev_start_time = start_time + end_time = 0 + for batch_id, data in enumerate(train_reader()): + prev_start_time = start_time + start_time = time.time() + loss_v = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[loss]) + end_time = time.time() + if batch_id % 20 == 0: + print("Pass {0}, batch {1}, loss {2}, time {3}".format( + pass_id, batch_id, loss_v[0], + start_time - prev_start_time)) + test(pass_id) + + if pass_id % 10 == 0 or pass_id == num_passes - 1: + model_path = os.path.join(model_save_dir, str(pass_id)) + print 'save models to %s' % (model_path) + fluid.io.save_persistables(exe, model_path) + + +def parallel_exe(args, + train_file_list, + val_file_list, + data_args, + learning_rate, + batch_size, + num_passes, + model_save_dir='model', + pretrained_model=None): + image_shape = [3, data_args.resize_h, data_args.resize_w] + if data_args.dataset == 'coco': + num_classes = 81 + elif data_args.dataset == 'pascalvoc': + num_classes = 21 + + devices = os.getenv("CUDA_VISIBLE_DEVICES") or "" + devices_num = 
len(devices.split(","))
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    gt_box = fluid.layers.data(
+        name='gt_box', shape=[4], dtype='float32', lod_level=1)
+    gt_label = fluid.layers.data(
+        name='gt_label', shape=[1], dtype='int32', lod_level=1)
+    difficult = fluid.layers.data(
+        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
+
+    locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
+    nmsed_out = fluid.layers.detection_output(
+        locs, confs, box, box_var, nms_threshold=0.45)
+    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, box_var)
+    loss = fluid.layers.reduce_sum(loss)
+
+    test_program = fluid.default_main_program().clone(for_test=True)
+    with fluid.program_guard(test_program):
+        map_eval = fluid.evaluator.DetectionMAP(
+            nmsed_out,
+            gt_label,
+            gt_box,
+            difficult,
+            num_classes,
+            overlap_threshold=0.5,
+            evaluate_difficult=False,
+            ap_version=args.ap_version)
+
+    if data_args.dataset == 'coco':
+        # the learning rate decays at pass 12 and pass 19
+        if '2014' in train_file_list:
+            epocs = 82783 / batch_size
+            boundaries = [epocs * 12, epocs * 19]
+        elif '2017' in train_file_list:
+            epocs = 118287 / batch_size
+            boundaries = [epocs * 12, epocs * 19]
+    elif data_args.dataset == 'pascalvoc':
+        epocs = 19200 / batch_size
+        boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
+    values = [
+        learning_rate, learning_rate * 0.5, learning_rate * 0.25,
+        learning_rate * 0.1, learning_rate * 0.01
+    ]
+    optimizer = fluid.optimizer.RMSProp(
+        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
+        regularization=fluid.regularizer.L2Decay(0.00005), )
+
+    optimizer.minimize(loss)
+
+    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    fluid.default_startup_program().random_seed = 1000
+    exe.run(fluid.default_startup_program())
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    if args.parallel:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=args.use_gpu, loss_name=loss.name)
+
+    train_reader = paddle.batch(
+        reader.train(data_args, train_file_list), batch_size=batch_size)
+    test_reader = paddle.batch(
+        reader.test(data_args, val_file_list), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        place=place, feed_list=[image, gt_box, gt_label, difficult])
+
+    def save_model(postfix):
+        model_path = os.path.join(model_save_dir, postfix)
+        if os.path.isdir(model_path):
+            shutil.rmtree(model_path)
+        print 'save models to %s' % (model_path)
+        fluid.io.save_persistables(exe, model_path)
+
+    best_map = 0. 
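+    # best_map tracks the highest test-set mAP seen so far; test() below
+    # saves a "best_model" checkpoint whenever it improves.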
+ + def test(pass_id, best_map): + _, accum_map = map_eval.get_map_var() + map_eval.reset(exe) + test_map = None + for data in test_reader(): + test_map = exe.run(test_program, + feed=feeder.feed(data), + fetch_list=[accum_map]) + if test_map[0] > best_map: + best_map = test_map[0] + save_model('best_model') + print("Test {0}, map {1}".format(pass_id, test_map[0])) + + train_num = 0 + total_train_time = 0.0 + total_iters = 0 + for pass_id in range(num_passes): + every_pass_loss = [] + iter = 0 + pass_duration = 0.0 + for batch_id, data in enumerate(train_reader()): + batch_start = time.time() + if iter == args.iterations: + break + if len(data) < devices_num: continue + if args.parallel: + loss_v, = train_exe.run(fetch_list=[loss.name], + feed=feeder.feed(data)) + else: + loss_v, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[loss]) + loss_v = np.mean(np.array(loss_v)) + if batch_id % 20 == 0: + print("Pass {0}, batch {1}, loss {2}, time {3}".format( + pass_id, batch_id, loss_v, time.time() - batch_start)) + if iter >= args.skip_batch_num or pass_id != 0: + batch_duration = time.time() - batch_start + pass_duration += batch_duration + train_num += len(data) + every_pass_loss.append(loss_v) + iter += 1 + total_iters += 1 + #test(pass_id, best_map) + total_train_time += pass_duration + print("Pass:%d, Loss:%f, Handle Images Duration: %f\n" % + (pass_id, np.mean(every_pass_loss), pass_duration)) + if pass_id == num_passes - 1: + examples_per_sec = train_num / total_train_time + train_cost_kpi.add_record(np.mean(every_pass_loss)) + train_speed_kpi.add_record( + np.array( + examples_per_sec, dtype='float')) + four_card_speed_kpi.add_record( + np.array( + examples_per_sec, dtype='float')) + if args.gpu_card_num == 1: + train_cost_kpi.persist() + train_speed_kpi.persist() + else: + four_card_speed_kpi.persist() + print("Best test map {0}".format(best_map)) + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + + data_dir = '/data/pascalvoc' + train_file_list = 'trainval.txt' + val_file_list = 'test.txt' + label_file = 'label_list' + model_save_dir = args.model_save_dir + if args.dataset == 'coco': + data_dir = './data/COCO17' + train_file_list = 'annotations/instances_train2017.json' + val_file_list = 'annotations/instances_val2017.json' + label_file = 'label_list' + + data_args = reader.Settings( + dataset=args.dataset, + data_dir=data_dir, + label_file=label_file, + apply_distort=args.apply_distort, + apply_expand=args.apply_expand, + resize_h=args.resize_h, + resize_w=args.resize_w, + mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R], + toy=args.is_toy) + #method = parallel_do + method = parallel_exe + method( + args, + train_file_list=train_file_list, + val_file_list=val_file_list, + data_args=data_args, + learning_rate=args.learning_rate, + batch_size=args.batch_size, + num_passes=args.num_passes, + model_save_dir=model_save_dir, + pretrained_model=args.pretrained_model) diff --git a/object_detection/utility.py b/object_detection/utility.py new file mode 100644 index 00000000..506e6007 --- /dev/null +++ b/object_detection/utility.py @@ -0,0 +1,62 @@ +"""Contains common utility functions.""" +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import distutils.util +import numpy as np +from paddle.fluid import core + + +def print_arguments(args): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + add_argument("name", str, "Jonh", "User name.", parser) + args = parser.parse_args() + """ + type = distutils.util.strtobool if type == bool else type + argparser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) diff --git a/resnet30/continuous_evaluation.py b/resnet30/continuous_evaluation.py deleted file mode 100644 index 283fcf48..00000000 --- a/resnet30/continuous_evaluation.py +++ /dev/null @@ -1,12 +0,0 @@ -import os -import sys -sys.path.append(os.environ['ceroot']) -from kpi import CostKpi, DurationKpi - -train_cost_kpi = CostKpi('train_cost', 0.01) -train_duration_kpi = DurationKpi('train_duration', 0.04) - -tracking_kpis = [ - train_cost_kpi, - train_duration_kpi, -] diff --git a/resnet30/latest_kpis/train_cost_factor.txt b/resnet30/latest_kpis/train_cost_factor.txt deleted file mode 100644 index 040e98ef..00000000 --- a/resnet30/latest_kpis/train_cost_factor.txt +++ /dev/null @@ -1,10 +0,0 @@ -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] diff --git a/resnet30/latest_kpis/train_duration_factor.txt b/resnet30/latest_kpis/train_duration_factor.txt deleted file mode 100644 index 13bd6f8b..00000000 --- a/resnet30/latest_kpis/train_duration_factor.txt +++ /dev/null @@ -1,10 +0,0 @@ -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] diff --git a/resnet50/continuous_evaluation.py b/resnet50/continuous_evaluation.py index 4fa0d179..b4eed6d2 100644 --- a/resnet50/continuous_evaluation.py +++ b/resnet50/continuous_evaluation.py @@ -3,12 +3,12 @@ sys.path.append(os.environ['ceroot']) from kpi import CostKpi, DurationKpi, AccKpi -cifar10_128_train_acc_kpi = AccKpi('cifar10_128_train_acc', 0.05, 0) -cifar10_128_train_speed_kpi = AccKpi('cifar10_128_train_speed', 0.05, 0) -cifar10_128_gpu_memory_kpi = DurationKpi('cifar10_128_gpu_memory', 0.05, 0) +cifar10_128_train_acc_kpi = AccKpi('cifar10_128_train_acc', 0.03, 0, actived=True) +cifar10_128_train_speed_kpi = AccKpi('cifar10_128_train_speed', 0.06, 0, actived=True) +cifar10_128_gpu_memory_kpi = DurationKpi('cifar10_128_gpu_memory', 0.1, 0, 
actived=True) -flowers_64_train_speed_kpi = AccKpi('flowers_64_train_speed', 0.05, 0) -flowers_64_gpu_memory_kpi = DurationKpi('flowers_64_gpu_memory', 0.05, 0) +flowers_64_train_speed_kpi = AccKpi('flowers_64_train_speed', 0.05, 0, actived=True) +flowers_64_gpu_memory_kpi = DurationKpi('flowers_64_gpu_memory', 0.1, 0, actived=True) tracking_kpis = [ cifar10_128_train_acc_kpi, diff --git a/resnet50/get_gpu_data.py b/resnet50/get_gpu_data.py index 83e57e34..1e391253 100644 --- a/resnet50/get_gpu_data.py +++ b/resnet50/get_gpu_data.py @@ -5,7 +5,6 @@ # Copyright (c) 2018 Baidu.com, Inc. All Rights Reserved # ######################################################################## - """ File: get_gpu_data.py Author: paddle(paddle@baidu.com) @@ -14,7 +13,6 @@ import argparse from continuous_evaluation import tracking_kpis - parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( '--batch_size', type=int, default=128, help="Batch size for training.") @@ -26,20 +24,21 @@ help='Optional dataset for benchmark.') args = parser.parse_args() + def save_gpu_data(): mem_list = [] - with open('mem.log', 'r') as f: - for i , data in enumerate(f.readlines()): + with open('memory.txt', 'r') as f: + for i, data in enumerate(f.readlines()): if i == 0: continue mem_list.append(int(data.split("\n")[0].split(" ")[0])) gpu_memory_factor = None - for kpi in tracking_kpis: + for kpi in tracking_kpis: if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size): gpu_memory_kpi = kpi gpu_memory_kpi.add_record(max(mem_list)) gpu_memory_kpi.persist() + if __name__ == "__main__": save_gpu_data() - diff --git a/resnet50/latest_kpis/cifar10_128_gpu_memory_factor.txt b/resnet50/latest_kpis/cifar10_128_gpu_memory_factor.txt index 466467aa..5ebe01c2 100644 --- a/resnet50/latest_kpis/cifar10_128_gpu_memory_factor.txt +++ b/resnet50/latest_kpis/cifar10_128_gpu_memory_factor.txt @@ -1 +1 @@ -[1508] \ No newline at end of file +[1394] diff --git a/resnet50/latest_kpis/cifar10_128_train_acc_factor.txt b/resnet50/latest_kpis/cifar10_128_train_acc_factor.txt index c276983e..83208824 100644 --- a/resnet50/latest_kpis/cifar10_128_train_acc_factor.txt +++ b/resnet50/latest_kpis/cifar10_128_train_acc_factor.txt @@ -1 +1 @@ -[0.99755859375] \ No newline at end of file +[0.93755859375] diff --git a/resnet50/latest_kpis/cifar10_128_train_speed_factor.txt b/resnet50/latest_kpis/cifar10_128_train_speed_factor.txt index c4fe04e3..f37998fd 100644 --- a/resnet50/latest_kpis/cifar10_128_train_speed_factor.txt +++ b/resnet50/latest_kpis/cifar10_128_train_speed_factor.txt @@ -1 +1 @@ -[404.4730529785156] \ No newline at end of file +[738.095703125] diff --git a/resnet50/latest_kpis/flowers_64_gpu_memory_factor.txt b/resnet50/latest_kpis/flowers_64_gpu_memory_factor.txt index 24c95b88..2799deaf 100644 --- a/resnet50/latest_kpis/flowers_64_gpu_memory_factor.txt +++ b/resnet50/latest_kpis/flowers_64_gpu_memory_factor.txt @@ -1 +1 @@ -[11014] \ No newline at end of file +[10352] diff --git a/resnet50/latest_kpis/flowers_64_train_speed_factor.txt b/resnet50/latest_kpis/flowers_64_train_speed_factor.txt index 8585524f..4938eeb8 100644 --- a/resnet50/latest_kpis/flowers_64_train_speed_factor.txt +++ b/resnet50/latest_kpis/flowers_64_train_speed_factor.txt @@ -1 +1 @@ -[78.7945785522461] \ No newline at end of file +[106.87747192382812] diff --git a/resnet50/model.py b/resnet50/model.py index 91977d26..f4e7beea 100644 --- a/resnet50/model.py +++ b/resnet50/model.py @@ -7,6 +7,7 @@ import numpy as np import time import commands 
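+# subprocess is used to launch nvidia-smi in the background and sample GPU
+# memory usage into memory.txt (see collect_gpu_memory_data below).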
+import subprocess import threading import cProfile @@ -90,8 +91,8 @@ def parse_args(): def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] - and vars(args)['device'] == 'GPU') + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') print('----------- Configuration Arguments -----------') for arg, value in sorted(vars(args).iteritems()): print('%s: %s' % (arg, value)) @@ -282,14 +283,15 @@ def test(exe): if iter == args.iterations: break if not args.use_fake_data: - image = np.array( - map(lambda x: x[0].reshape(dshape), data)).astype('float32') + image = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype('float32') label = np.array(map(lambda x: x[1], data)).astype('int64') label = label.reshape([-1, 1]) loss, acc, weight = exe.run( - fluid.default_main_program(), feed={ - 'data': image, 'label': label}, fetch_list=[ - avg_cost, batch_acc, batch_size_tensor]) + fluid.default_main_program(), + feed={'data': image, + 'label': label}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor]) accuracy.add(value=acc, weight=weight) if iter >= args.skip_batch_num or pass_id != 0: batch_duration = time.time() - batch_start @@ -305,8 +307,9 @@ def test(exe): pass_train_acc = accuracy.eval() pass_test_acc = test(exe) print( - "Pass:%d, Loss:%f, Train Accuray:%f, Test Accuray:%f, Handle Images Duration: %f\n" % - (pass_id, np.mean(every_pass_loss), pass_train_acc, pass_test_acc, pass_duration)) + "Pass:%d, Loss:%f, Train Accuray:%f, Test Accuray:%f, Handle Images Duration: %f\n" + % (pass_id, np.mean(every_pass_loss), pass_train_acc, + pass_test_acc, pass_duration)) if pass_id == args.pass_num - 1 and args.data_set == 'cifar10': train_acc_kpi.add_record(np.array(pass_train_acc, dtype='float32')) train_acc_kpi.persist() @@ -317,9 +320,8 @@ def test(exe): train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32')) train_speed_kpi.persist() - print( - '\nTotal examples: %d, total time: %.5f' % - (im_num, total_train_time)) + print('\nTotal examples: %d, total time: %.5f' % + (im_num, total_train_time)) print('%.5f examples/sec, %.5f sec/batch \n' % (examples_per_sec, sec_per_batch)) @@ -332,23 +334,26 @@ def test(exe): print(s.getvalue()) -def collect_gpu_memory_data(mem_list): +def collect_gpu_memory_data(alive): """ collect the GPU memory data """ - while(True): - command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv" % args.gpu_id - status, output = commands.getstatusoutput(command) - if status != 0: - print('Get GPU memory data error') - else: - mem_list.append(int(output.split('\n')[1].split(' ')[0])) + global is_alive + status, output = commands.getstatusoutput('rm -rf memory.txt') + if status == 0: + print('del memory.txt') + command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id + p = subprocess.Popen(command, shell=True) + if p.pid < 0: + print('Get GPU memory data error') + while (is_alive): time.sleep(1) + p.kill() def save_gpu_data(mem_list): gpu_memory_kpi = None - for kpi in tracking_kpis: + for kpi in tracking_kpis: if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size): gpu_memory_kpi = kpi gpu_memory_kpi.add_record(max(mem_list)) @@ -362,12 +367,13 @@ def save_gpu_data(mem_list): } args = parse_args() print_arguments(args) + global is_alive + is_alive = True if args.data_format == 'NHWC': raise ValueError('Only support NCHW data_format now.') - mem_data_list = [] if args.device == 'GPU': collect_memory_thread = 
threading.Thread( - target=collect_gpu_memory_data, args=(mem_data_list,)) + target=collect_gpu_memory_data, args=(is_alive, )) collect_memory_thread.setDaemon(True) collect_memory_thread.start() if args.use_nvprof and args.device == 'GPU': @@ -375,4 +381,4 @@ def save_gpu_data(mem_list): run_benchmark(model_map[args.model], args) else: run_benchmark(model_map[args.model], args) - save_gpu_data(mem_data_list) + is_alive = False diff --git a/resnet50/run.xsh b/resnet50/run.xsh index 81f7847f..04764927 100755 --- a/resnet50/run.xsh +++ b/resnet50/run.xsh @@ -7,6 +7,12 @@ export CUDA_VISIBLE_DEVICES=$cudaid # cifar10 128 FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=128 --data_set=cifar10 --model=resnet_cifar10 --pass_num=30 --gpu_id=$cudaid +python get_gpu_data.py --batch_size=128 --data_set=cifar10 #flowers 64 FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=64 --data_set=flowers --model=resnet_imagenet --pass_num=3 --gpu_id=$cudaid +python get_gpu_data.py --batch_size=64 --data_set=flowers +for pid in $(ps -ef | grep nvidia-smi | grep -v grep | cut -c 9-15); do + echo $pid + kill -9 $pid +done \ No newline at end of file diff --git a/seq2seq/continuous_evaluation.py b/seq2seq/continuous_evaluation.py new file mode 100644 index 00000000..191f2c63 --- /dev/null +++ b/seq2seq/continuous_evaluation.py @@ -0,0 +1,17 @@ +""" +continuous_evaluation.py +""" +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import AccKpi +from kpi import CostKpi +from kpi import DurationKpi + +wmb_128_train_speed_kpi = AccKpi('wmb_128_train_speed', 0.2, 0) +wmb_128_gpu_memory_kpi = DurationKpi('wmb_128_gpu_memory', 0.2, 0) + +tracking_kpis = [ + wmb_128_train_speed_kpi, + wmb_128_gpu_memory_kpi, +] diff --git a/seq2seq/get_gpu_data.py b/seq2seq/get_gpu_data.py new file mode 100644 index 00000000..c852351d --- /dev/null +++ b/seq2seq/get_gpu_data.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +######################################################################## +# +# Copyright (c) 2018 Baidu.com, Inc. 
All Rights Reserved +# +######################################################################## +""" +File: get_gpu_data.py +Author: paddle(paddle@baidu.com) +Date: 2018/04/02 15:57:14 +""" +import argparse +from continuous_evaluation import tracking_kpis + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--data_set', + type=str, + default='wmb', + help='Optional dataset for benchmark.') +args = parser.parse_args() + + +def save_gpu_data(): + mem_list = [] + with open('memory.txt', 'r') as f: + for i, data in enumerate(f.readlines()): + if i == 0: + continue + mem_list.append(int(data.split("\n")[0].split(" ")[0])) + gpu_memory_factor = None + for kpi in tracking_kpis: + if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size): + gpu_memory_kpi = kpi + gpu_memory_kpi.add_record(max(mem_list)) + gpu_memory_kpi.persist() + + +if __name__ == "__main__": + save_gpu_data() diff --git a/seq2seq/latest_kpis/wmb_128_gpu_memory_factor.txt b/seq2seq/latest_kpis/wmb_128_gpu_memory_factor.txt new file mode 100644 index 00000000..e5d1e87f --- /dev/null +++ b/seq2seq/latest_kpis/wmb_128_gpu_memory_factor.txt @@ -0,0 +1 @@ +[6976] diff --git a/seq2seq/latest_kpis/wmb_128_train_speed_factor.txt b/seq2seq/latest_kpis/wmb_128_train_speed_factor.txt new file mode 100644 index 00000000..f845312b --- /dev/null +++ b/seq2seq/latest_kpis/wmb_128_train_speed_factor.txt @@ -0,0 +1 @@ +[4430.63330078125] diff --git a/seq2seq/model.py b/seq2seq/model.py new file mode 100644 index 00000000..7c0db4e0 --- /dev/null +++ b/seq2seq/model.py @@ -0,0 +1,421 @@ +"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util +import commands +import subprocess +import threading + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor + +from continuous_evaluation import tracking_kpis + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=16, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=2, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. 
(default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + "--gpu_id", + type=int, + default=3, + help="The GPU Card Id. (default: %(default)d)") +parser.add_argument( + "--max_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. 
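+        # dynamic_lstm slices this (4 * gate_size)-wide projection into the
+        # input, forget, cell-candidate and output gate pre-activations.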
+ input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + def lstm_decoder_with_attention(target_embedding, encoder_vec, + encoder_proj, decoder_boot, decoder_size): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + act='tanh', + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + encoder_vec = rnn.static_input(encoder_vec) + encoder_proj = rnn.static_input(encoder_proj) + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) + cell_mem = rnn.memory(init=cell_init) + context = simple_attention(encoder_vec, encoder_proj, hidden_mem) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, + decoder_size) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + rnn.output(out) + return rnn() + + if not is_generating: + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, + encoded_proj, decoder_boot, + decoder_size) + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = 
fluid.layers.mean(x=cost) + + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + return avg_cost, feeding_list + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray + + +def train(): + avg_cost, feeding_list = seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = Executor(place) + exe.run(framework.default_startup_program()) + + def do_validation(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] + trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] + lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] + + fetch_outs = exe.run(inference_program, + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] + count += 1 + + return total_loss / count + + train_acc_kpi = None + for kpi in tracking_kpis: + if kpi.name == 'wmb_%s_train_acc' % (args.batch_size): + train_acc_kpi = kpi + train_speed_kpi = None + for kpi in tracking_kpis: + if kpi.name == 'wmb_%s_train_speed' % (args.batch_size): + train_speed_kpi = kpi + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in xrange(args.pass_num): + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_batch_generator()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) + num_samples += word_num + trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) + num_samples += word_num + lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) + + fetch_outs = exe.run(framework.default_main_program(), + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost]) + + iters += 1 + loss = np.array(fetch_outs[0]) + print( + "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss) + ) # The accuracy is the accumulation of batches, but not the current batch. 
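+        # Throughput below is measured from the end of the warm-up
+        # (skip_batch_num) batches; num_samples counts source and target
+        # tokens.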
+
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32'))
+        if args.with_test:
+            test_loss = do_validation()
+            break
+    train_speed_kpi.persist()
+
+
+def infer():
+    pass
+
+
+def print_arguments(args):
+    print('----------- seq2seq Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def collect_gpu_memory_data(alive):
+    """
+    collect the GPU memory data
+    """
+    global is_alive
+    status, output = commands.getstatusoutput('rm -rf memory.txt')
+    if status == 0:
+        print('del memory.txt')
+    command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id
+    p = subprocess.Popen(command, shell=True)
+    if p.pid < 0:
+        print('Get GPU memory data error')
+    while (is_alive):
+        time.sleep(1)
+    p.kill()
+
+
+def save_gpu_data(mem_list):
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == 'wmb_%s_gpu_memory' % (args.batch_size):
+            gpu_memory_kpi = kpi
+    gpu_memory_kpi.add_record(max(mem_list))
+    gpu_memory_kpi.persist()
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+    global is_alive
+    is_alive = True
+    collect_memory_thread = threading.Thread(
+        target=collect_gpu_memory_data, args=(is_alive, ))
+    collect_memory_thread.setDaemon(True)
+    collect_memory_thread.start()
+    if args.infer_only:
+        infer()
+    else:
+        train()
+    is_alive = False diff --git a/seq2seq/run.xsh b/seq2seq/run.xsh new file mode 100755 index 00000000..2e315c0c --- /dev/null +++ b/seq2seq/run.xsh @@ -0,0 +1,14 @@ +#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${seq2seq_cudaid:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+#wmt14 128
+FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=128 --iterations=50 --gpu_id=$cudaid
+python get_gpu_data.py --batch_size=128 --data_set=wmb
+for pid in $(ps -ef | grep nvidia-smi | grep -v grep | cut -c 9-15); do
+    echo $pid
+    kill -9 $pid
+done diff --git a/sequence_tagging_for_ner/README.md b/sequence_tagging_for_ner/README.md new file mode 100644 index 00000000..1f634da4 --- /dev/null +++ b/sequence_tagging_for_ner/README.md @@ -0,0 +1,120 @@ +# Named Entity Recognition
+
+Below is the directory layout of this example, with a brief description of each item:
+
+```text
+.
+├── data            # the data this example depends on; fetched externally
+├── network_conf.py # model definition
+├── reader.py       # data reading interface; fetched externally
+├── README.md       # this document
+├── train.py        # training script
+├── infer.py        # inference script
+├── utils.py        # common helper functions; fetched externally
+└── utils_extend.py # extensions to utils.py
+```
+
+
+## Introduction and Model Details
+
+The PaddlePaddle v2 [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) example describes the NER task in detail, so the introduction is not repeated here.
+We keep the v2 model structure; the only difference is that we use an LSTM in place of the original RNN.
+
+## Getting the Data
+
+Follow the data acquisition steps in the PaddlePaddle v2 [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) example: copy its data folder into this example's directory and run the download.sh script inside it to fetch the training and test data.
+
+## Getting the Shared Scripts
+
+Copy the data-reading file [reader.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/reader.py) and the file [utils.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/utils.py), which provides shared helpers such as dictionary loading, from the PaddlePaddle v2 [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) example into this directory. Both scripts are used by this example.
+
+## Training
+
+1. Run `sh data/download.sh`
+2. Modify the `main` function of `train.py` to specify the data paths
+
+    ```python
+    main(
+        train_data_file="data/train",
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        emb_file="data/wordVectors.txt",
+        model_save_dir="models",
+        num_passes=1000,
+        use_gpu=False,
+        parallel=False)
+    ```
+
+3. Run `python train.py`. **Note: running it as-is trains on the sample data; replace it with real labeled data.**
+
+    ```text
+    Pass 127, Batch 9525, Cost 4.0867705, Precision 0.3954984, Recall 0.37846154, F1_score0.38679245
+    Pass 127, Batch 9530, Cost 3.137265, Precision 0.42971888, Recall 0.38351256, F1_score0.405303
+    Pass 127, Batch 9535, Cost 3.6240938, Precision 0.4272152, Recall 0.41795665, F1_score0.4225352
+    Pass 127, Batch 9540, Cost 3.5352352, Precision 0.48464164, Recall 0.4536741, F1_score0.46864685
+    Pass 127, Batch 9545, Cost 4.1130385, Precision 0.40131578, Recall 0.3836478, F1_score0.39228293
+    Pass 127, Batch 9550, Cost 3.6826708, Precision 0.43333334, Recall 0.43730888, F1_score0.43531203
+    Pass 127, Batch 9555, Cost 3.6363933, Precision 0.42424244, Recall 0.3962264, F1_score0.4097561
+    Pass 127, Batch 9560, Cost 3.6101768, Precision 0.51363635, Recall 0.353125, F1_score0.41851854
+    Pass 127, Batch 9565, Cost 3.5935276, Precision 0.5152439, Recall 0.5, F1_score0.5075075
+    Pass 127, Batch 9570, Cost 3.4987144, Precision 0.5, Recall 0.4330218, F1_score0.46410686
+    Pass 127, Batch 9575, Cost 3.4659843, Precision 0.39864865, Recall 0.38064516, F1_score0.38943896
+    Pass 127, Batch 9580, Cost 3.1702557, Precision 0.5, Recall 0.4490446, F1_score0.47315437
+    Pass 127, Batch 9585, Cost 3.1587276, Precision 0.49377593, Recall 0.4089347, F1_score0.4473684
+    Pass 127, Batch 9590, Cost 3.5043538, Precision 0.4556962, Recall 0.4600639, F1_score0.45786962
+    Pass 127, Batch 9595, Cost 2.981989, Precision 0.44981414, Recall 0.45149255, F1_score0.4506518
+    [TrainSet] pass_id:127 pass_precision:[0.46023396] pass_recall:[0.43197003] pass_f1_score:[0.44565433]
+    [TestSet] pass_id:127 pass_precision:[0.4708409] pass_recall:[0.47971722] pass_f1_score:[0.4752376]
+    ```
+## Inference
+1. Modify the `infer` function of [infer.py](./infer.py) to specify the path of the model to test, the test data, the vocabulary file and the target label file; the defaults are:
+
+    ```python
+    infer(
+        model_path="models/params_pass_0",
+        batch_size=6,
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        use_gpu=False
+    )
+    ```
+
+2. Run `python infer.py` in a terminal to start inference. You will see predictions like the following (partial output of a model trained for 70 passes):
+
+    ```text
+    leicestershire B-ORG B-LOC
+    extended O O
+    their O O
+    first O O
+    innings O O
+    by O O
+    DGDG O O
+    runs O O
+    before O O
+    being O O
+    bowled O O
+    out O O
+    for O O
+    296 O O
+    with O O
+    england B-LOC B-LOC
+    discard O O
+    andy B-PER B-PER
+    caddick I-PER I-PER
+    taking O O
+    three O O
+    for O O
+    DGDG O O
+    . O O
+    ```
+
+    The output has three columns separated by "\t": the first is the input word, the second is the gold label, and the third is the predicted label. Input sequences are separated by blank lines.
+
+## Example Results
+
+![convergence curve](imgs/convergence_curve.png)
+
+Figure 1. Learning curve: the x-axis is the number of training passes and the y-axis is the F1 score.
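+
+As a quick sanity check, the three-column output of `python infer.py` can be scored with a few lines of Python. The snippet below is a sketch, not part of this example; it assumes the predictions were redirected to a hypothetical file `pred.txt`:
+
+```python
+# tag-level accuracy: compare the gold (2nd) and predicted (3rd) columns
+correct, total = 0, 0
+with open("pred.txt") as f:
+    for line in f:
+        parts = line.rstrip("\n").split("\t")
+        if len(parts) != 3:  # blank lines separate sequences
+            continue
+        word, gold, pred = parts
+        total += 1
+        correct += int(gold == pred)
+print("tag accuracy: %.4f" % (correct / float(total)))
+```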
diff --git a/sequence_tagging_for_ner/continuous_evaluation.py b/sequence_tagging_for_ner/continuous_evaluation.py new file mode 100644 index 00000000..e8e4ccd0 --- /dev/null +++ b/sequence_tagging_for_ner/continuous_evaluation.py @@ -0,0 +1,20 @@ +""" +continuous_evaluation.py +""" +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import AccKpi +from kpi import DurationKpi + +train_acc_kpi = AccKpi('train_acc', 0.2, 0) +pass_duration_kpi = DurationKpi('pass_duration', 0.02, 0, actived=True) +train_acc_kpi_card4 = AccKpi('train_acc_card4', 0.2, 0) +pass_duration_kpi_card4 = DurationKpi('pass_duration_card4', 0.02, 0, actived=True) + +tracking_kpis = [ + train_acc_kpi, + pass_duration_kpi, + train_acc_kpi_card4, + pass_duration_kpi_card4, +] diff --git a/sequence_tagging_for_ner/data/target.txt b/sequence_tagging_for_ner/data/target.txt new file mode 100644 index 00000000..e0fa4d8f --- /dev/null +++ b/sequence_tagging_for_ner/data/target.txt @@ -0,0 +1,9 @@ +B-LOC +I-LOC +B-MISC +I-MISC +B-ORG +I-ORG +B-PER +I-PER +O diff --git a/sequence_tagging_for_ner/data/test b/sequence_tagging_for_ner/data/test new file mode 100644 index 00000000..66163e1a --- /dev/null +++ b/sequence_tagging_for_ner/data/test @@ -0,0 +1,128 @@ +CRICKET NNP I-NP O +- : O O +LEICESTERSHIRE NNP I-NP I-ORG +TAKE NNP I-NP O +OVER IN I-PP O +AT NNP I-NP O +TOP NNP I-NP O +AFTER NNP I-NP O +INNINGS NNP I-NP O +VICTORY NN I-NP O +. . O O + +LONDON NNP I-NP I-LOC +1996-08-30 CD I-NP O + +West NNP I-NP I-MISC +Indian NNP I-NP I-MISC +all-rounder NN I-NP O +Phil NNP I-NP I-PER +Simmons NNP I-NP I-PER +took VBD I-VP O +four CD I-NP O +for IN I-PP O +38 CD I-NP O +on IN I-PP O +Friday NNP I-NP O +as IN I-PP O +Leicestershire NNP I-NP I-ORG +beat VBD I-VP O +Somerset NNP I-NP I-ORG +by IN I-PP O +an DT I-NP O +innings NN I-NP O +and CC O O +39 CD I-NP O +runs NNS I-NP O +in IN I-PP O +two CD I-NP O +days NNS I-NP O +to TO I-VP O +take VB I-VP O +over IN I-PP O +at IN B-PP O +the DT I-NP O +head NN I-NP O +of IN I-PP O +the DT I-NP O +county NN I-NP O +championship NN I-NP O +. . O O + +Their PRP$ I-NP O +stay NN I-NP O +on IN I-PP O +top NN I-NP O +, , O O +though RB I-ADVP O +, , O O +may MD I-VP O +be VB I-VP O +short-lived JJ I-ADJP O +as IN I-PP O +title NN I-NP O +rivals NNS I-NP O +Essex NNP I-NP I-ORG +, , O O +Derbyshire NNP I-NP I-ORG +and CC I-NP O +Surrey NNP I-NP I-ORG +all DT O O +closed VBD I-VP O +in RP I-PRT O +on IN I-PP O +victory NN I-NP O +while IN I-SBAR O +Kent NNP I-NP I-ORG +made VBD I-VP O +up RP I-PRT O +for IN I-PP O +lost VBN I-NP O +time NN I-NP O +in IN I-PP O +their PRP$ I-NP O +rain-affected JJ I-NP O +match NN I-NP O +against IN I-PP O +Nottinghamshire NNP I-NP I-ORG +. . O O + +After IN I-PP O +bowling VBG I-NP O +Somerset NNP I-NP I-ORG +out RP I-PRT O +for IN I-PP O +83 CD I-NP O +on IN I-PP O +the DT I-NP O +opening NN I-NP O +morning NN I-NP O +at IN I-PP O +Grace NNP I-NP I-LOC +Road NNP I-NP I-LOC +, , O O +Leicestershire NNP I-NP I-ORG +extended VBD I-VP O +their PRP$ I-NP O +first JJ I-NP O +innings NN I-NP O +by IN I-PP O +94 CD I-NP O +runs VBZ I-VP O +before IN I-PP O +being VBG I-VP O +bowled VBD I-VP O +out RP I-PRT O +for IN I-PP O +296 CD I-NP O +with IN I-PP O +England NNP I-NP I-LOC +discard VBP I-VP O +Andy NNP I-NP I-PER +Caddick NNP I-NP I-PER +taking VBG I-VP O +three CD I-NP O +for IN I-PP O +83 CD I-NP O +. . 
O O + diff --git a/sequence_tagging_for_ner/data/train b/sequence_tagging_for_ner/data/train new file mode 100644 index 00000000..cbf3e678 --- /dev/null +++ b/sequence_tagging_for_ner/data/train @@ -0,0 +1,139 @@ +EU NNP I-NP I-ORG +rejects VBZ I-VP O +German JJ I-NP I-MISC +call NN I-NP O +to TO I-VP O +boycott VB I-VP O +British JJ I-NP I-MISC +lamb NN I-NP O +. . O O + +Peter NNP I-NP I-PER +Blackburn NNP I-NP I-PER + +BRUSSELS NNP I-NP I-LOC +1996-08-22 CD I-NP O + +The DT I-NP O +European NNP I-NP I-ORG +Commission NNP I-NP I-ORG +said VBD I-VP O +on IN I-PP O +Thursday NNP I-NP O +it PRP B-NP O +disagreed VBD I-VP O +with IN I-PP O +German JJ I-NP I-MISC +advice NN I-NP O +to TO I-PP O +consumers NNS I-NP O +to TO I-VP O +shun VB I-VP O +British JJ I-NP I-MISC +lamb NN I-NP O +until IN I-SBAR O +scientists NNS I-NP O +determine VBP I-VP O +whether IN I-SBAR O +mad JJ I-NP O +cow NN I-NP O +disease NN I-NP O +can MD I-VP O +be VB I-VP O +transmitted VBN I-VP O +to TO I-PP O +sheep NN I-NP O +. . O O + +Germany NNP I-NP I-LOC +'s POS B-NP O +representative NN I-NP O +to TO I-PP O +the DT I-NP O +European NNP I-NP I-ORG +Union NNP I-NP I-ORG +'s POS B-NP O +veterinary JJ I-NP O +committee NN I-NP O +Werner NNP I-NP I-PER +Zwingmann NNP I-NP I-PER +said VBD I-VP O +on IN I-PP O +Wednesday NNP I-NP O +consumers NNS I-NP O +should MD I-VP O +buy VB I-VP O +sheepmeat NN I-NP O +from IN I-PP O +countries NNS I-NP O +other JJ I-ADJP O +than IN I-PP O +Britain NNP I-NP I-LOC +until IN I-SBAR O +the DT I-NP O +scientific JJ I-NP O +advice NN I-NP O +was VBD I-VP O +clearer JJR I-ADJP O +. . O O + +" " O O +We PRP I-NP O +do VBP I-VP O +n't RB I-VP O +support VB I-VP O +any DT I-NP O +such JJ I-NP O +recommendation NN I-NP O +because IN I-SBAR O +we PRP I-NP O +do VBP I-VP O +n't RB I-VP O +see VB I-VP O +any DT I-NP O +grounds NNS I-NP O +for IN I-PP O +it PRP I-NP O +, , O O +" " O O +the DT I-NP O +Commission NNP I-NP I-ORG +'s POS B-NP O +chief JJ I-NP O +spokesman NN I-NP O +Nikolaus NNP I-NP I-PER +van NNP I-NP I-PER +der FW I-NP I-PER +Pas NNP I-NP I-PER +told VBD I-VP O +a DT I-NP O +news NN I-NP O +briefing NN I-NP O +. . O O + +He PRP I-NP O +said VBD I-VP O +further JJ I-NP O +scientific JJ I-NP O +study NN I-NP O +was VBD I-VP O +required VBN I-VP O +and CC O O +if IN I-SBAR O +it PRP I-NP O +was VBD I-VP O +found VBN I-VP O +that IN I-SBAR O +action NN I-NP O +was VBD I-VP O +needed VBN I-VP O +it PRP I-NP O +should MD I-VP O +be VB I-VP O +taken VBN I-VP O +by IN I-PP O +the DT I-NP O +European NNP I-NP I-ORG +Union NNP I-NP I-ORG +. . O O + diff --git a/sequence_tagging_for_ner/download.sh b/sequence_tagging_for_ner/download.sh new file mode 100644 index 00000000..861f943e --- /dev/null +++ b/sequence_tagging_for_ner/download.sh @@ -0,0 +1,15 @@ +if [ -f assignment2.zip ]; then + echo "data exist" +else + wget http://cs224d.stanford.edu/assignment2/assignment2.zip +fi + +if [ $? -eq 0 ];then + unzip assignment2.zip + cp assignment2_release/data/ner/wordVectors.txt ./data + cp assignment2_release/data/ner/vocab.txt ./data + rm -rf assignment2.zip assignment2_release +else + echo "download data error!" 
>> /dev/stderr + exit 1 +fi diff --git a/sequence_tagging_for_ner/imgs/convergence_curve.png b/sequence_tagging_for_ner/imgs/convergence_curve.png new file mode 100644 index 00000000..6b862b75 Binary files /dev/null and b/sequence_tagging_for_ner/imgs/convergence_curve.png differ diff --git a/sequence_tagging_for_ner/infer.py b/sequence_tagging_for_ner/infer.py new file mode 100644 index 00000000..2d0bd949 --- /dev/null +++ b/sequence_tagging_for_ner/infer.py @@ -0,0 +1,71 @@ +import numpy as np + +import paddle.fluid as fluid +import paddle.v2 as paddle + +from network_conf import ner_net +import reader +from utils import load_dict, load_reverse_dict +from utils_extend import to_lodtensor + + +def infer(model_path, batch_size, test_data_file, vocab_file, target_file, + use_gpu): + """ + use the model under model_path to predict the test data, the result will be printed on the screen + + return nothing + """ + word_dict = load_dict(vocab_file) + word_reverse_dict = load_reverse_dict(vocab_file) + + label_dict = load_dict(target_file) + label_reverse_dict = load_reverse_dict(target_file) + + test_data = paddle.batch( + reader.data_reader(test_data_file, word_dict, label_dict), + batch_size=batch_size) + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + for data in test_data(): + word = to_lodtensor(map(lambda x: x[0], data), place) + mark = to_lodtensor(map(lambda x: x[1], data), place) + target = to_lodtensor(map(lambda x: x[2], data), place) + crf_decode = exe.run( + inference_program, + feed={"word": word, + "mark": mark, + "target": target}, + fetch_list=fetch_targets, + return_numpy=False) + lod_info = (crf_decode[0].lod())[0] + np_data = np.array(crf_decode[0]) + assert len(data) == len(lod_info) - 1 + for sen_index in xrange(len(data)): + assert len(data[sen_index][0]) == lod_info[ + sen_index + 1] - lod_info[sen_index] + word_index = 0 + for tag_index in xrange(lod_info[sen_index], + lod_info[sen_index + 1]): + word = word_reverse_dict[data[sen_index][0][word_index]] + gold_tag = label_reverse_dict[data[sen_index][2][ + word_index]] + tag = label_reverse_dict[np_data[tag_index][0]] + print word + "\t" + gold_tag + "\t" + tag + word_index += 1 + print "" + + +if __name__ == "__main__": + infer( + model_path="models/params_pass_0", + batch_size=6, + test_data_file="data/test", + vocab_file="data/vocab.txt", + target_file="data/target.txt", + use_gpu=False) diff --git a/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt b/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt new file mode 100644 index 00000000..bbcc1bf4 --- /dev/null +++ b/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt @@ -0,0 +1 @@ +[0.04497942033021347] \ No newline at end of file diff --git a/sequence_tagging_for_ner/latest_kpis/pass_duration_factor.txt b/sequence_tagging_for_ner/latest_kpis/pass_duration_factor.txt new file mode 100644 index 00000000..683e1d69 --- /dev/null +++ b/sequence_tagging_for_ner/latest_kpis/pass_duration_factor.txt @@ -0,0 +1 @@ +[0.021749680643496307] diff --git a/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt b/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt new file mode 100644 index 00000000..e7a19a6e --- /dev/null +++ 
b/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt @@ -0,0 +1 @@ +[1.0] \ No newline at end of file diff --git a/sequence_tagging_for_ner/latest_kpis/train_acc_factor.txt b/sequence_tagging_for_ner/latest_kpis/train_acc_factor.txt new file mode 100644 index 00000000..0ea64a68 --- /dev/null +++ b/sequence_tagging_for_ner/latest_kpis/train_acc_factor.txt @@ -0,0 +1 @@ +[1.0] diff --git a/sequence_tagging_for_ner/network_conf.py b/sequence_tagging_for_ner/network_conf.py new file mode 100644 index 00000000..3611d7b7 --- /dev/null +++ b/sequence_tagging_for_ner/network_conf.py @@ -0,0 +1,130 @@ +import math + +import paddle.fluid as fluid +from paddle.fluid.initializer import NormalInitializer + +from utils import logger, load_dict, get_embedding + + +def ner_net(word_dict_len, label_dict_len, parallel, stack_num=2): + mark_dict_len = 2 + word_dim = 50 + mark_dim = 5 + hidden_dim = 300 + IS_SPARSE = True + embedding_name = 'emb' + + def _net_conf(word, mark, target): + word_embedding = fluid.layers.embedding( + input=word, + size=[word_dict_len, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) + + mark_embedding = fluid.layers.embedding( + input=mark, + size=[mark_dict_len, mark_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + word_caps_vector = fluid.layers.concat( + input=[word_embedding, mark_embedding], axis=1) + mix_hidden_lr = 1 + + rnn_para_attr = fluid.ParamAttr( + initializer=NormalInitializer( + loc=0.0, scale=0.0), + learning_rate=mix_hidden_lr) + hidden_para_attr = fluid.ParamAttr( + initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)), + learning_rate=mix_hidden_lr) + + hidden = fluid.layers.fc( + input=word_caps_vector, + name="__hidden00__", + size=hidden_dim, + act="tanh", + bias_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))), + param_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)))) + fea = [] + for direction in ["fwd", "bwd"]: + for i in range(stack_num): + if i != 0: + hidden = fluid.layers.fc( + name="__hidden%02d_%s__" % (i, direction), + size=hidden_dim, + act="stanh", + bias_attr=fluid.ParamAttr( + initializer=NormalInitializer( + loc=0.0, scale=1.0)), + input=[hidden, rnn[0], rnn[1]], + param_attr=[ + hidden_para_attr, rnn_para_attr, rnn_para_attr + ]) + rnn = fluid.layers.dynamic_lstm( + name="__rnn%02d_%s__" % (i, direction), + input=hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + bias_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=1.0)), + is_reverse=(i % 2) if direction == "fwd" else not i % 2, + param_attr=rnn_para_attr) + fea += [hidden, rnn[0], rnn[1]] + + rnn_fea = fluid.layers.fc( + size=hidden_dim, + bias_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))), + act="stanh", + input=fea, + param_attr=[hidden_para_attr, rnn_para_attr, rnn_para_attr] * 2) + + emission = fluid.layers.fc( + size=label_dict_len, + input=rnn_fea, + param_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)))) + + crf_cost = fluid.layers.linear_chain_crf( + input=emission, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', + initializer=NormalInitializer( + loc=0.0, scale=(1. 
/ math.sqrt(hidden_dim) / 3)), + learning_rate=mix_hidden_lr)) + avg_cost = fluid.layers.mean(x=crf_cost) + return avg_cost, emission + + word = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark', shape=[1], dtype='int64', lod_level=1) + target = fluid.layers.data( + name="target", shape=[1], dtype='int64', lod_level=1) + + if parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + word_ = pd.read_input(word) + mark_ = pd.read_input(mark) + target_ = pd.read_input(target) + avg_cost, emission_base = _net_conf(word_, mark_, target_) + pd.write_output(avg_cost) + pd.write_output(emission_base) + avg_cost_list, emission = pd() + avg_cost = fluid.layers.mean(x=avg_cost_list) + emission.stop_gradient = True + else: + avg_cost, emission = _net_conf(word, mark, target) + + return avg_cost, emission, word, mark, target diff --git a/sequence_tagging_for_ner/reader.py b/sequence_tagging_for_ner/reader.py new file mode 100644 index 00000000..5050d0bf --- /dev/null +++ b/sequence_tagging_for_ner/reader.py @@ -0,0 +1,66 @@ +""" +Conll03 dataset. +""" + +from utils import * + +__all__ = ["data_reader"] + + +def canonicalize_digits(word): + if any([c.isalpha() for c in word]): return word + word = re.sub("\d", "DG", word) + if word.startswith("DG"): + word = word.replace(",", "") # remove thousands separator + return word + + +def canonicalize_word(word, wordset=None, digits=True): + word = word.lower() + if digits: + if (wordset != None) and (word in wordset): return word + word = canonicalize_digits(word) # try to canonicalize numbers + if (wordset == None) or (word in wordset): return word + else: return "UUUNKKK" # unknown token + + +def data_reader(data_file, word_dict, label_dict): + """ + The dataset can be obtained according to http://www.clips.uantwerpen.be/conll2003/ner/. + It returns a reader creator, each sample in the reader includes: + word id sequence, label id sequence and raw sentence. 
+ + :return: reader creator + :rtype: callable + """ + + def reader(): + UNK_IDX = word_dict["UUUNKKK"] + + sentence = [] + labels = [] + with open(data_file, "r") as f: + for line in f: + if len(line.strip()) == 0: + if len(sentence) > 0: + word_idx = [ + word_dict.get( + canonicalize_word(w, word_dict), UNK_IDX) + for w in sentence + ] + mark = [1 if w[0].isupper() else 0 for w in sentence] + label_idx = [label_dict[l] for l in labels] + yield word_idx, mark, label_idx + sentence = [] + labels = [] + else: + segs = line.strip().split() + sentence.append(segs[0]) + # transform I-TYPE to BIO schema + if segs[-1] != "O" and (len(labels) == 0 or + labels[-1][1:] != segs[-1][1:]): + labels.append("B" + segs[-1][1:]) + else: + labels.append(segs[-1]) + + return reader diff --git a/sequence_tagging_for_ner/run.xsh b/sequence_tagging_for_ner/run.xsh new file mode 100755 index 00000000..9fda2d21 --- /dev/null +++ b/sequence_tagging_for_ner/run.xsh @@ -0,0 +1,16 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +cudaid=${sequence_tagging:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid +#pass_num 2200 +sh download.sh +FLAGS_benchmark=true python train.py + +cudaid=${sequence_tagging_m:=0,1,2,3} # use multi card as default +export CUDA_VISIBLE_DEVICES=$cudaid +#pass_num 2200 +sh download.sh +FLAGS_benchmark=true python train.py --gpu_card_num 4 diff --git a/sequence_tagging_for_ner/train.py b/sequence_tagging_for_ner/train.py new file mode 100644 index 00000000..a80e75b1 --- /dev/null +++ b/sequence_tagging_for_ner/train.py @@ -0,0 +1,149 @@ +import os +import time +import math +import numpy as np + +import paddle +import paddle.fluid as fluid +import argparse +import reader +from network_conf import ner_net +from utils import logger, load_dict +from utils_extend import to_lodtensor, get_embedding +from continuous_evaluation import * + +def parse_args(): + parser = argparse.ArgumentParser("sequence_tagging_for_ner model benchmark.") + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + + args = parser.parse_args() + return args + +def test(exe, chunk_evaluator, inference_program, test_data, place): + chunk_evaluator.reset(exe) + for data in test_data(): + word = to_lodtensor(map(lambda x: x[0], data), place) + mark = to_lodtensor(map(lambda x: x[1], data), place) + target = to_lodtensor(map(lambda x: x[2], data), place) + acc = exe.run(inference_program, + feed={"word": word, + "mark": mark, + "target": target}) + return chunk_evaluator.eval(exe) + + +def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, + model_save_dir, num_passes, use_gpu, parallel): + + args = parse_args() + if not os.path.exists(model_save_dir): + os.mkdir(model_save_dir) + + BATCH_SIZE = 200 + word_dict = load_dict(vocab_file) + label_dict = load_dict(target_file) + + word_vector_values = get_embedding(emb_file) + + word_dict_len = len(word_dict) + label_dict_len = len(label_dict) + + avg_cost, feature_out, word, mark, target = ner_net( + word_dict_len, label_dict_len, parallel) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + sgd_optimizer.minimize(avg_cost) + + crf_decode = fluid.layers.crf_decoding( + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + chunk_evaluator = fluid.evaluator.ChunkEvaluator( + input=crf_decode, + label=target, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) + + inference_program = fluid.default_main_program().clone() + with 
fluid.program_guard(inference_program): + test_target = chunk_evaluator.metrics + chunk_evaluator.states + inference_program = fluid.io.get_inference_program(test_target) + + train_reader = paddle.batch( + paddle.reader.shuffle( + reader.data_reader(train_data_file, word_dict, label_dict), + buf_size=20000), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.reader.shuffle( + reader.data_reader(test_data_file, word_dict, label_dict), + buf_size=20000), + batch_size=BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place) + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + + embedding_name = 'emb' + embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor( + ) + embedding_param.set(word_vector_values, place) + + batch_id = 0 + total_time = 0.0 + for pass_id in xrange(num_passes): + chunk_evaluator.reset(exe) + start_time = time.time() + for data in train_reader(): + cost, batch_precision, batch_recall, batch_f1_score = exe.run( + fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + chunk_evaluator.metrics) + batch_id = batch_id + 1 + t1 = time.time() + total_time += t1 - start_time + pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe) + if pass_id == num_passes - 1: + if args.gpu_card_num == 1: + train_acc_kpi.add_record(pass_precision) + pass_duration_kpi.add_record(total_time / num_passes) + else: + train_acc_kpi_card4.add_record(pass_precision) + pass_duration_kpi_card4.add_record(total_time / num_passes) + + if pass_id % 100 == 0: + print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + + str(pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str(pass_f1_score)) + pass_precision, pass_recall, pass_f1_score = test( + exe, chunk_evaluator, inference_program, test_reader, place) + if pass_id % 100 == 0: + print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + + str(pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str(pass_f1_score)) + + #save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id) + #fluid.io.save_inference_model( + # save_dirname, ['word', 'mark', 'target'], [crf_decode], exe) + + if args.gpu_card_num == 1: + train_acc_kpi.persist() + pass_duration_kpi.persist() + else: + train_acc_kpi_card4.persist() + pass_duration_kpi_card4.persist() + + +if __name__ == "__main__": + main( + train_data_file="data/train", + test_data_file="data/test", + vocab_file="data/vocab.txt", + target_file="data/target.txt", + emb_file="data/wordVectors.txt", + model_save_dir="models", + num_passes=2300, + use_gpu=True, + parallel=True) diff --git a/sequence_tagging_for_ner/utils.py b/sequence_tagging_for_ner/utils.py new file mode 100644 index 00000000..f40f1bb1 --- /dev/null +++ b/sequence_tagging_for_ner/utils.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import logging +import os +import re +import argparse +import numpy as np +from collections import defaultdict + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +def get_embedding(emb_file='data/wordVectors.txt'): + """ + Get the trained word vector. + """ + return np.loadtxt(emb_file, dtype=float) + + +def load_dict(dict_path): + """ + Load the word dictionary from the given file. + Each line of the given file is a word, which can include multiple columns + seperated by tab. 
+
+    This function takes the first column (columns in a line are separated by
+    tab) as the key, and the line number of the line (the index of the word
+    in the dictionary) as the value.
+    """
+
+    return dict((line.strip().split("\t")[0], idx)
+                for idx, line in enumerate(open(dict_path, "r").readlines()))
+
+
+def load_reverse_dict(dict_path):
+    """
+    Load the word dictionary from the given file.
+    Each line of the given file is a word, which can include multiple columns
+    separated by tab.
+
+    This function takes the line number of the line (the index of the word in
+    the dictionary) as the key, and the first column (columns in a line are
+    separated by tab) as the value.
+    """
+    return dict((idx, line.strip().split("\t")[0])
+                for idx, line in enumerate(open(dict_path, "r").readlines()))
diff --git a/sequence_tagging_for_ner/utils_extend.py b/sequence_tagging_for_ner/utils_extend.py
new file mode 100644
index 00000000..03e7e62f
--- /dev/null
+++ b/sequence_tagging_for_ner/utils_extend.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+import paddle.fluid as fluid
+
+
+def get_embedding(emb_file='data/wordVectors.txt'):
+    """
+    Get the trained word vector.
+    """
+    return np.loadtxt(emb_file, dtype='float32')
+
+
+def to_lodtensor(data, place):
+    """
+    Convert data to a LoDTensor.
+    """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
diff --git a/text_classification/README.md b/text_classification/README.md
new file mode 100644
index 00000000..7855f6c2
--- /dev/null
+++ b/text_classification/README.md
@@ -0,0 +1,113 @@
+# Text Classification
+
+Below is a brief layout of this example's directory structure:
+
+```text
+.
+├── nets.py       # model definitions
+├── README.md     # this documentation
+├── train.py      # training script
+├── infer.py      # inference script
+└── utils.py      # common utility functions (external data fetching, etc.)
+```
+
+
+## Introduction and Model Details
+
+The PaddlePaddle v2 [text classification](https://github.com/PaddlePaddle/models/blob/develop/text/README.md) example already introduces the text classification task in detail, so the introduction is not repeated here.
+For the models, we adopt four common text classification networks: bow, cnn, lstm, and gru.
+
+## Training
+
+1. Run the command `python train.py bow` to start training the model.
+   ```python
+   python train.py bow   # "bow" selects the network structure and can be replaced with cnn, lstm, or gru
+   ```
+
+2. (Optional) To customize the network structure, add your own network in [nets.py](./nets.py) and set the corresponding parameters in [train.py](./train.py). The signature of `train` is shown below, followed by an illustrative sketch of a custom network.
+   ```python
+   def train(train_reader,    # training data reader
+             word_dict,       # word dictionary
+             network,         # network configuration
+             use_cuda,        # whether to use the GPU
+             parallel,        # whether to train in parallel
+             save_dirname,    # path for saving the model
+             lr=0.2,          # learning rate
+             batch_size=128,  # number of samples per batch
+             pass_num=30):    # number of training passes
+   ```
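+
+A custom network only needs to follow the same contract as `bow_net`, `cnn_net`, `lstm_net`, and `gru_net` in [nets.py](./nets.py): it takes `(data, label, dict_dim)` plus optional hyper-parameters and returns `(avg_cost, acc, prediction)`. The sketch below is illustrative only; the name `mlp_net` is made up and is not part of nets.py:
+
+```python
+import paddle.fluid as fluid
+
+
+def mlp_net(data, label, dict_dim, emb_dim=128, hid_dim=128, class_dim=2):
+    """Illustrative custom net: embedding -> average pooling -> FC layers."""
+    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    pooled = fluid.layers.sequence_pool(input=emb, pool_type='average')
+    fc = fluid.layers.fc(input=pooled, size=hid_dim, act="tanh")
+    prediction = fluid.layers.fc(input=fc, size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+```
+
+After adding it, import it in [train.py](./train.py) and pass it to `train(...)` just like the built-in networks.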
+
+## Sample Training Results
+```text
+pass_id: 0, avg_acc: 0.848040, avg_cost: 0.354073
+pass_id: 1, avg_acc: 0.914200, avg_cost: 0.217945
+pass_id: 2, avg_acc: 0.929800, avg_cost: 0.184302
+pass_id: 3, avg_acc: 0.938680, avg_cost: 0.164240
+pass_id: 4, avg_acc: 0.945120, avg_cost: 0.149150
+pass_id: 5, avg_acc: 0.951280, avg_cost: 0.137117
+pass_id: 6, avg_acc: 0.955360, avg_cost: 0.126434
+pass_id: 7, avg_acc: 0.961400, avg_cost: 0.117405
+pass_id: 8, avg_acc: 0.963560, avg_cost: 0.110070
+pass_id: 9, avg_acc: 0.965840, avg_cost: 0.103273
+pass_id: 10, avg_acc: 0.969800, avg_cost: 0.096314
+pass_id: 11, avg_acc: 0.971720, avg_cost: 0.090206
+pass_id: 12, avg_acc: 0.974800, avg_cost: 0.084970
+pass_id: 13, avg_acc: 0.977400, avg_cost: 0.078981
+pass_id: 14, avg_acc: 0.980000, avg_cost: 0.073685
+pass_id: 15, avg_acc: 0.981080, avg_cost: 0.069898
+pass_id: 16, avg_acc: 0.982080, avg_cost: 0.064923
+pass_id: 17, avg_acc: 0.984680, avg_cost: 0.060861
+pass_id: 18, avg_acc: 0.985840, avg_cost: 0.057095
+pass_id: 19, avg_acc: 0.988080, avg_cost: 0.052424
+pass_id: 20, avg_acc: 0.989160, avg_cost: 0.049059
+pass_id: 21, avg_acc: 0.990120, avg_cost: 0.045882
+pass_id: 22, avg_acc: 0.992080, avg_cost: 0.042140
+pass_id: 23, avg_acc: 0.992280, avg_cost: 0.039722
+pass_id: 24, avg_acc: 0.992840, avg_cost: 0.036607
+pass_id: 25, avg_acc: 0.994440, avg_cost: 0.034040
+pass_id: 26, avg_acc: 0.995000, avg_cost: 0.031501
+pass_id: 27, avg_acc: 0.995440, avg_cost: 0.028988
+pass_id: 28, avg_acc: 0.996240, avg_cost: 0.026639
+pass_id: 29, avg_acc: 0.996960, avg_cost: 0.024186
+```
+
+## Prediction
+1. Run the command `python infer.py bow_model` to start inference.
+   ```python
+   python infer.py bow_model   # bow_model specifies the directory of the model to load
+   ```
+
+## Sample Prediction Results
+```text
+model_path: bow_model/epoch0, avg_acc: 0.882800
+model_path: bow_model/epoch1, avg_acc: 0.882360
+model_path: bow_model/epoch2, avg_acc: 0.881400
+model_path: bow_model/epoch3, avg_acc: 0.877800
+model_path: bow_model/epoch4, avg_acc: 0.872920
+model_path: bow_model/epoch5, avg_acc: 0.872640
+model_path: bow_model/epoch6, avg_acc: 0.869960
+model_path: bow_model/epoch7, avg_acc: 0.865160
+model_path: bow_model/epoch8, avg_acc: 0.863680
+model_path: bow_model/epoch9, avg_acc: 0.861200
+model_path: bow_model/epoch10, avg_acc: 0.853520
+model_path: bow_model/epoch11, avg_acc: 0.850400
+model_path: bow_model/epoch12, avg_acc: 0.855960
+model_path: bow_model/epoch13, avg_acc: 0.853480
+model_path: bow_model/epoch14, avg_acc: 0.855960
+model_path: bow_model/epoch15, avg_acc: 0.854120
+model_path: bow_model/epoch16, avg_acc: 0.854160
+model_path: bow_model/epoch17, avg_acc: 0.852240
+model_path: bow_model/epoch18, avg_acc: 0.852320
+model_path: bow_model/epoch19, avg_acc: 0.850280
+model_path: bow_model/epoch20, avg_acc: 0.849760
+model_path: bow_model/epoch21, avg_acc: 0.850160
+model_path: bow_model/epoch22, avg_acc: 0.846800
+model_path: bow_model/epoch23, avg_acc: 0.845440
+model_path: bow_model/epoch24, avg_acc: 0.845640
+model_path: bow_model/epoch25, avg_acc: 0.846200
+model_path: bow_model/epoch26, avg_acc: 0.845880
+model_path: bow_model/epoch27, avg_acc: 0.844880
+model_path: bow_model/epoch28, avg_acc: 0.844680
+model_path: bow_model/epoch29, avg_acc: 0.844960
+```
+
+Note: the accuracy keeps decreasing in later epochs because of overfitting; this is expected and can be ignored.
diff --git a/text_classification/continuous_evaluation.py b/text_classification/continuous_evaluation.py
new file mode 100644
index 00000000..9d9c9240
--- /dev/null
+++ b/text_classification/continuous_evaluation.py
@@ -0,0 +1,19 @@
+"""
+continuous_evaluation.py
+"""
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+lstm_train_cost_kpi = CostKpi('lstm_train_cost', 5, 0)
+lstm_pass_duration_kpi = DurationKpi('lstm_pass_duration', 0.02, 0, actived=True)
+
+lstm_train_cost_kpi_card4 = CostKpi('lstm_train_cost_card4', 0.2, 0)
+lstm_pass_duration_kpi_card4 = DurationKpi('lstm_pass_duration_card4', 0.02, 0, actived=True)
+
+tracking_kpis = [
+    lstm_train_cost_kpi, lstm_pass_duration_kpi,
+    lstm_train_cost_kpi_card4, lstm_pass_duration_kpi_card4,
+]
diff --git a/text_classification/infer.py b/text_classification/infer.py
new file mode 100644
index 00000000..d2a0363d
--- /dev/null
+++ b/text_classification/infer.py
@@ -0,0 +1,50 @@
+import sys
+import time
+import unittest
+import contextlib
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+import utils
+
+
+def infer(test_reader, use_cuda, model_path=None):
+    """
+    inference function
+    """
+    if model_path is None:
+        print(str(model_path) + " cannot be found")
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+
+        total_acc = 0.0
+        total_count = 0
+        for data in test_reader():
+            acc = exe.run(inference_program,
+                          feed=utils.data2tensor(data, place),
+                          fetch_list=fetch_targets,
+                          return_numpy=True)
+            total_acc += acc[0] * len(data)
+            total_count += len(data)
+
+        avg_acc
= total_acc / total_count + print("model_path: %s, avg_acc: %f" % (model_path, avg_acc)) + + +if __name__ == "__main__": + word_dict, train_reader, test_reader = utils.prepare_data( + "imdb", self_dict=False, batch_size=128, buf_size=50000) + + model_path = sys.argv[1] + for i in range(30): + epoch_path = model_path + "/" + "epoch" + str(i) + infer(test_reader, use_cuda=False, model_path=epoch_path) diff --git a/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt b/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt new file mode 100644 index 00000000..bfd66206 --- /dev/null +++ b/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt @@ -0,0 +1 @@ +[17.750867716471355] \ No newline at end of file diff --git a/text_classification/latest_kpis/lstm_pass_duration_factor.txt b/text_classification/latest_kpis/lstm_pass_duration_factor.txt new file mode 100644 index 00000000..60ab6882 --- /dev/null +++ b/text_classification/latest_kpis/lstm_pass_duration_factor.txt @@ -0,0 +1 @@ +[15.24635027249654] diff --git a/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt b/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt new file mode 100644 index 00000000..f8d4e66e --- /dev/null +++ b/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt @@ -0,0 +1 @@ +[0.0030332264248281717] diff --git a/text_classification/latest_kpis/lstm_train_cost_factor.txt b/text_classification/latest_kpis/lstm_train_cost_factor.txt new file mode 100644 index 00000000..1224335d --- /dev/null +++ b/text_classification/latest_kpis/lstm_train_cost_factor.txt @@ -0,0 +1 @@ +[0.000792166159953922] diff --git a/text_classification/nets.py b/text_classification/nets.py new file mode 100644 index 00000000..cd572c72 --- /dev/null +++ b/text_classification/nets.py @@ -0,0 +1,124 @@ +import sys +import time +import numpy as np + +import paddle.fluid as fluid +import paddle + + +def bow_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + bow net + """ + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def cnn_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + win_size=3): + """ + conv net + """ + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=win_size, + act="tanh", + pool_type="max") + + fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) + + prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def lstm_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + lstm net + """ + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + 
param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + + lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + + fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') + + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def gru_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=400.0): + """ + gru net + """ + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) + gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) + gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction diff --git a/text_classification/run.xsh b/text_classification/run.xsh new file mode 100755 index 00000000..29c8faab --- /dev/null +++ b/text_classification/run.xsh @@ -0,0 +1,14 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +cudaid=${text_classification:=0} +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --model lstm + +cudaid=${text_classification_m:=0,1,2,3} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +#LSTM pass_num 15 +FLAGS_benchmark=true python train.py --model lstm --gpu_card_num 4 diff --git a/text_classification/train.py b/text_classification/train.py new file mode 100644 index 00000000..dfb3f877 --- /dev/null +++ b/text_classification/train.py @@ -0,0 +1,162 @@ +import sys +import time +import unittest +import contextlib + +import paddle.fluid as fluid +import paddle +import argparse +import utils +from nets import bow_net +from nets import cnn_net +from nets import lstm_net +from nets import gru_net +from continuous_evaluation import * +fluid.default_startup_program().random_seed = 99 + +def parse_args(): + parser = argparse.ArgumentParser("text_classification model benchmark.") + parser.add_argument( + '--model', type=str, default="lstm", help='model to run.') + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + + args = parser.parse_args() + return args + +def train(train_reader, + word_dict, + network, + use_cuda, + parallel, + save_dirname, + lr=0.2, + batch_size=128, + pass_num=30): + """ + train network + """ + args = parse_args() + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + if not parallel: + cost, acc, prediction = network(data, label, len(word_dict)) + else: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + cost, acc, prediction = network( + pd.read_input(data), pd.read_input(label), 
len(word_dict))
+
+            pd.write_output(cost)
+            pd.write_output(acc)
+
+        cost, acc = pd()
+        cost = fluid.layers.mean(cost)
+        acc = fluid.layers.mean(acc)
+
+    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
+    sgd_optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    exe.run(fluid.default_startup_program())
+    total_time = 0.0
+    newest_avg_cost = 0.0
+    for pass_id in xrange(pass_num):
+        start_time = time.time()
+        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
+        for data in train_reader():
+            avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
+                                              feed=feeder.feed(data),
+                                              fetch_list=[cost, acc])
+            data_size = len(data)
+            total_acc += data_size * avg_acc_np
+            total_cost += data_size * avg_cost_np
+            data_count += data_size
+        avg_cost = total_cost / data_count
+        newest_avg_cost = avg_cost
+        t1 = time.time()
+        total_time += t1 - start_time
+        avg_acc = total_acc / data_count
+        print("pass_id: %d, avg_acc: %f, avg_cost: %f" %
+              (pass_id, avg_acc, avg_cost))
+        if pass_id == pass_num - 1:
+            if args.gpu_card_num == 1:
+                lstm_train_cost_kpi.add_record(newest_avg_cost)
+                lstm_pass_duration_kpi.add_record(total_time / pass_num)
+            else:
+                lstm_train_cost_kpi_card4.add_record(newest_avg_cost)
+                lstm_pass_duration_kpi_card4.add_record(total_time / pass_num)
+
+        epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
+        fluid.io.save_inference_model(epoch_model, ["words", "label"], acc,
+                                      exe)
+    if args.gpu_card_num == 1:
+        lstm_train_cost_kpi.persist()
+        lstm_pass_duration_kpi.persist()
+    else:
+        lstm_train_cost_kpi_card4.persist()
+        lstm_pass_duration_kpi_card4.persist()
+
+def train_net():
+    args = parse_args()
+    word_dict, train_reader, test_reader = utils.prepare_data(
+        "imdb", self_dict=False, batch_size=128, buf_size=50000)
+
+    if args.model == "bow":
+        train(
+            train_reader,
+            word_dict,
+            bow_net,
+            use_cuda=False,
+            parallel=False,
+            save_dirname="bow_model",
+            lr=0.002,
+            pass_num=30,
+            batch_size=128)
+    elif args.model == "cnn":
+        train(
+            train_reader,
+            word_dict,
+            cnn_net,
+            use_cuda=True,
+            parallel=False,
+            save_dirname="cnn_model",
+            lr=0.01,
+            pass_num=30,
+            batch_size=4)
+    elif args.model == "lstm":
+        train(
+            train_reader,
+            word_dict,
+            lstm_net,
+            use_cuda=True,
+            parallel=True,
+            save_dirname="lstm_model",
+            lr=0.05,
+            pass_num=15,
+            batch_size=4)
+    elif args.model == "gru":
+        train(
+            train_reader,
+            word_dict,
+            gru_net,
+            use_cuda=True,
+            parallel=False,
+            save_dirname="gru_model",
+            lr=0.05,
+            pass_num=30,
+            batch_size=128)
+    else:
+        print("network name cannot be found!")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    train_net()
diff --git a/text_classification/utils.py b/text_classification/utils.py
new file mode 100644
index 00000000..bff77d11
--- /dev/null
+++ b/text_classification/utils.py
@@ -0,0 +1,99 @@
+import sys
+import time
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle
+
+
+def to_lodtensor(data, place):
+    """
+    Convert data to a LoDTensor.
+    """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def load_vocab(filename):
+    """
+    Load the imdb vocabulary.
+    """
+    vocab = {}
+    with
open(filename) as f: + wid = 0 + for line in f: + vocab[line.strip()] = wid + wid += 1 + vocab[""] = len(vocab) + return vocab + + +def data2tensor(data, place): + """ + data2tensor + """ + input_seq = to_lodtensor(map(lambda x: x[0], data), place) + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + return {"words": input_seq, "label": y_data} + + +def prepare_data(data_type="imdb", + self_dict=False, + batch_size=128, + buf_size=50000): + """ + prepare data + """ + if self_dict: + word_dict = load_vocab(data_type + ".vocab") + else: + if data_type == "imdb": + word_dict = paddle.dataset.imdb.word_dict() + elif data_type == "light_imdb": + word_dict = light_imdb.word_dict() + elif data_type == "tiny_imdb": + word_dict = tiny_imdb.word_dict() + else: + raise RuntimeError("No such dataset") + + if data_type == "imdb": + train_reader = paddle.batch( + paddle.dataset.imdb.train(word_dict), + batch_size=batch_size) + + test_reader = paddle.batch( + paddle.dataset.imdb.test(word_dict), + batch_size=batch_size) + + elif data_type == "light_imdb": + train_reader = paddle.batch( + light_imdb.train(word_dict), + batch_size=batch_size) + + test_reader = paddle.batch( + light_imdb.test(word_dict), + batch_size=batch_size) + + elif data_type == "tiny_imdb": + train_reader = paddle.batch( + tiny_imdb.train(word_dict), + batch_size=batch_size) + + test_reader = paddle.batch( + tiny_imdb.test(word_dict), + batch_size=batch_size) + else: + raise RuntimeError("no such dataset") + + return word_dict, train_reader, test_reader diff --git a/transformer/continuous_evaluation.py b/transformer/continuous_evaluation.py new file mode 100644 index 00000000..7a39755e --- /dev/null +++ b/transformer/continuous_evaluation.py @@ -0,0 +1,12 @@ +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi, DurationKpi, AccKpi + +train_avg_ppl_kpi = CostKpi('train_avg_ppl_kpi', 0.2, 0) +train_pass_duration_kpi = DurationKpi('train_pass_duration_kpi', 0.2, 0) + +tracking_kpis = [ + train_avg_ppl_kpi, + train_pass_duration_kpi, +] diff --git a/transformer/infer.py b/transformer/infer.py new file mode 100644 index 00000000..7d0c9776 --- /dev/null +++ b/transformer/infer.py @@ -0,0 +1,354 @@ +import numpy as np + +import paddle +import paddle.fluid as fluid + +import model +from model import wrap_encoder as encoder +from model import wrap_decoder as decoder +from transformer_config import * +from train import pad_batch_data + + +def translate_batch(exe, + src_words, + encoder, + enc_in_names, + enc_out_names, + decoder, + dec_in_names, + dec_out_names, + beam_size, + max_length, + n_best, + batch_size, + n_head, + d_model, + src_pad_idx, + trg_pad_idx, + bos_idx, + eos_idx, + unk_idx, + output_unk=True): + """ + Run the encoder program once and run the decoder program multiple times to + implement beam search externally. + """ + # Prepare data for encoder and run the encoder. + enc_in_data = pad_batch_data( + src_words, + src_pad_idx, + n_head, + is_target=False, + is_label=False, + return_attn_bias=True, + return_max_len=False) + # Append the data shape input to reshape the output of embedding layer. + enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1], d_model], dtype="int32") + ] + # Append the shape inputs to reshape before and after softmax in encoder + # self attention. 
+ enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array( + enc_in_data[2].shape, dtype="int32") + ] + enc_output = exe.run(encoder, + feed=dict(zip(enc_in_names, enc_in_data)), + fetch_list=enc_out_names)[0] + + # Beam Search. + # To store the beam info. + scores = np.zeros((batch_size, beam_size), dtype="float32") + prev_branchs = [[] for i in range(batch_size)] + next_ids = [[] for i in range(batch_size)] + # Use beam_inst_map to map beam idx to the instance idx in batch, since the + # size of feeded batch is changing. + beam_inst_map = { + beam_idx: inst_idx + for inst_idx, beam_idx in enumerate(range(batch_size)) + } + # Use active_beams to recode the alive. + active_beams = range(batch_size) + + def beam_backtrace(prev_branchs, next_ids, n_best=beam_size): + """ + Decode and select n_best sequences for one instance by backtrace. + """ + seqs = [] + for i in range(n_best): + k = i + seq = [] + for j in range(len(prev_branchs) - 1, -1, -1): + seq.append(next_ids[j][k]) + k = prev_branchs[j][k] + seq = seq[::-1] + # Add the , since next_ids don't include the . + seq = [bos_idx] + seq + seqs.append(seq) + return seqs + + def init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output): + """ + Initialize the input data for decoder. + """ + trg_words = np.array( + [[bos_idx]] * batch_size * beam_size, dtype="int64") + trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64") + src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[ + -1], enc_in_data[2], 1 + # This is used to remove attention on subsequent words. + trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len, + trg_max_len)) + trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape( + [-1, 1, trg_max_len, trg_max_len]) + trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) * + [-1e9]).astype("float32") + # This is used to remove attention on the paddings of source sequences. + trg_src_attn_bias = np.tile( + src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis], + [1, beam_size, 1, trg_max_len, 1]).reshape([ + -1, src_slf_attn_bias.shape[1], trg_max_len, + src_slf_attn_bias.shape[-1] + ]) + # Append the shape input to reshape the output of embedding layer. + trg_data_shape = np.array( + [batch_size * beam_size, trg_max_len, d_model], dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") + enc_output = np.tile( + enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape( + [-1, enc_output.shape[-2], enc_output.shape[-1]]) + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output + + def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map): + """ + Update the input data of decoder mainly by slicing from the previous + input data and dropping the finished instance beams. 
+ """ + trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output = dec_in_data + trg_cur_len = trg_slf_attn_bias.shape[-1] + 1 + trg_words = np.array( + [ + beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx]) + for beam_idx in active_beams + ], + dtype="int64") + trg_words = trg_words.reshape([-1, 1]) + trg_pos = np.array( + [range(1, trg_cur_len + 1)] * len(active_beams) * beam_size, + dtype="int64").reshape([-1, 1]) + active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams] + active_beams_indice = ( + (np.array(active_beams) * beam_size)[:, np.newaxis] + + np.array(range(beam_size))[np.newaxis, :]).flatten() + # This is used to remove attention on subsequent words. + trg_slf_attn_bias = np.ones((len(active_beams) * beam_size, + trg_cur_len, trg_cur_len)) + trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape( + [-1, 1, trg_cur_len, trg_cur_len]) + trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) * + [-1e9]).astype("float32") + # This is used to remove attention on the paddings of source sequences. + trg_src_attn_bias = np.tile(trg_src_attn_bias[ + active_beams_indice, :, ::trg_src_attn_bias.shape[2], :], + [1, 1, trg_cur_len, 1]) + # Append the shape input to reshape the output of embedding layer. + trg_data_shape = np.array( + [len(active_beams) * beam_size, trg_cur_len, d_model], + dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") + enc_output = enc_output[active_beams_indice, :, :] + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output + + dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data, + enc_output) + for i in range(max_length): + predict_all = exe.run(decoder, + feed=dict(zip(dec_in_names, dec_in_data)), + fetch_list=dec_out_names)[0] + predict_all = np.log( + predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1]) + [:, -1, :]) + predict_all = (predict_all + scores[active_beams].reshape( + [len(beam_inst_map) * beam_size, -1])).reshape( + [len(beam_inst_map), beam_size, -1]) + if not output_unk: # To exclude the token. 
+ predict_all[:, :, unk_idx] = -1e9 + active_beams = [] + for beam_idx in range(batch_size): + if not beam_inst_map.has_key(beam_idx): + continue + inst_idx = beam_inst_map[beam_idx] + predict = (predict_all[inst_idx, :, :] + if i != 0 else predict_all[inst_idx, 0, :]).flatten() + top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:] + top_scores_ids = top_k_indice[np.argsort(predict[top_k_indice]) + [::-1]] + top_scores = predict[top_scores_ids] + scores[beam_idx] = top_scores + prev_branchs[beam_idx].append(top_scores_ids / + predict_all.shape[-1]) + next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1]) + if next_ids[beam_idx][-1][0] != eos_idx: + active_beams.append(beam_idx) + if len(active_beams) == 0: + break + dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams, + beam_inst_map) + beam_inst_map = { + beam_idx: inst_idx + for inst_idx, beam_idx in enumerate(active_beams) + } + + # Decode beams and select n_best sequences for each instance by backtrace. + seqs = [ + beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best) + for beam_idx in range(batch_size) + ] + + return seqs, scores[:, :n_best].tolist() + + +def main(): + place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + encoder_program = fluid.Program() + with fluid.program_guard(main_program=encoder_program): + enc_output = encoder( + ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout) + + decoder_program = fluid.Program() + with fluid.program_guard(main_program=decoder_program): + predict = decoder( + ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout) + + # Load model parameters of encoder and decoder separately from the saved + # transformer model. + encoder_var_names = [] + for op in encoder_program.block(0).ops: + encoder_var_names += op.input_arg_names + encoder_param_names = filter( + lambda var_name: isinstance(encoder_program.block(0).var(var_name), + fluid.framework.Parameter), + encoder_var_names) + encoder_params = map(encoder_program.block(0).var, encoder_param_names) + decoder_var_names = [] + for op in decoder_program.block(0).ops: + decoder_var_names += op.input_arg_names + decoder_param_names = filter( + lambda var_name: isinstance(decoder_program.block(0).var(var_name), + fluid.framework.Parameter), + decoder_var_names) + decoder_params = map(decoder_program.block(0).var, decoder_param_names) + fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=encoder_params) + fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params) + + # This is used here to set dropout to the test mode. 
+ encoder_program = fluid.io.get_inference_program( + target_vars=[enc_output], main_program=encoder_program) + decoder_program = fluid.io.get_inference_program( + target_vars=[predict], main_program=decoder_program) + + test_data = paddle.batch( + paddle.dataset.wmt16.test(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=InferTaskConfig.batch_size) + + trg_idx2word = paddle.dataset.wmt16.get_dict( + "de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True) + + def post_process_seq(seq, + bos_idx=ModelHyperParams.bos_idx, + eos_idx=ModelHyperParams.eos_idx, + output_bos=InferTaskConfig.output_bos, + output_eos=InferTaskConfig.output_eos): + """ + Post-process the beam-search decoded sequence. Truncate from the first + and remove the and tokens currently. + """ + eos_pos = len(seq) - 1 + for i, idx in enumerate(seq): + if idx == eos_idx: + eos_pos = i + break + seq = seq[:eos_pos + 1] + return filter( + lambda idx: (output_bos or idx != bos_idx) and \ + (output_eos or idx != eos_idx), + seq) + + for batch_id, data in enumerate(test_data()): + batch_seqs, batch_scores = translate_batch( + exe, + [item[0] for item in data], + encoder_program, + encoder_data_input_fields + encoder_util_input_fields, + [enc_output.name], + decoder_program, + decoder_data_input_fields[:-1] + decoder_util_input_fields + + (decoder_data_input_fields[-1], ), + [predict.name], + InferTaskConfig.beam_size, + InferTaskConfig.max_length, + InferTaskConfig.n_best, + len(data), + ModelHyperParams.n_head, + ModelHyperParams.d_model, + ModelHyperParams.eos_idx, # Use eos_idx to pad. + ModelHyperParams.eos_idx, # Use eos_idx to pad. + ModelHyperParams.bos_idx, + ModelHyperParams.eos_idx, + ModelHyperParams.unk_idx, + output_unk=InferTaskConfig.output_unk) + for i in range(len(batch_seqs)): + # Post-process the beam-search decoded sequences. + seqs = map(post_process_seq, batch_seqs[i]) + scores = batch_scores[i] + for seq in seqs: + print(" ".join([trg_idx2word[idx] for idx in seq])) + + +if __name__ == "__main__": + main() diff --git a/transformer/latest_kpis/train_avg_ppl_kpi_factor.txt b/transformer/latest_kpis/train_avg_ppl_kpi_factor.txt new file mode 100644 index 00000000..4075807a --- /dev/null +++ b/transformer/latest_kpis/train_avg_ppl_kpi_factor.txt @@ -0,0 +1 @@ +[19.267375946044922] diff --git a/transformer/latest_kpis/train_pass_duration_kpi_factor.txt b/transformer/latest_kpis/train_pass_duration_kpi_factor.txt new file mode 100644 index 00000000..3cf0a471 --- /dev/null +++ b/transformer/latest_kpis/train_pass_duration_kpi_factor.txt @@ -0,0 +1 @@ +[56.41797208786011] diff --git a/transformer/model.py b/transformer/model.py new file mode 100644 index 00000000..f2ffb88e --- /dev/null +++ b/transformer/model.py @@ -0,0 +1,578 @@ +from functools import partial +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as layers + +from transformer_config import * + + +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. 
+ """ + position_enc = np.array([[ + pos / np.power(10000, 2 * (j // 2) / d_pos_vec) + for j in range(d_pos_vec) + ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + return position_enc.astype("float32") + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. + """ + q = layers.fc(input=queries, + size=d_key * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_key, + fan_out=n_head * d_key), + bias_attr=False, + num_flatten_dims=2) + k = layers.fc(input=keys, + size=d_key * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_key, + fan_out=n_head * d_key), + bias_attr=False, + num_flatten_dims=2) + v = layers.fc(input=values, + size=d_value * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_value, + fan_out=n_head * d_value), + bias_attr=False, + num_flatten_dims=2) + return q, k, v + + def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + if n_head == 1: + return x + + hidden_size = x.shape[-1] + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped = layers.reshape( + x=x, shape=[0, -1, n_head, hidden_size // n_head]) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. 
+ return layers.reshape( + x=trans_x, + shape=map(int, [0, -1, trans_x.shape[2] * trans_x.shape[3]])) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate): + """ + Scaled Dot-Product Attention + """ + scaled_q = layers.scale(x=q, scale=d_model**-0.5) + product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + weights = layers.reshape( + x=layers.elementwise_add( + x=product, y=attn_bias) if attn_bias else product, + shape=[-1, product.shape[-1]], + actual_shape=pre_softmax_shape, + act="softmax") + weights = layers.reshape( + x=weights, shape=product.shape, actual_shape=post_softmax_shape) + if dropout_rate: + weights = layers.dropout( + weights, dropout_prob=dropout_rate, is_test=False) + out = layers.matmul(weights, v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + param_attr=fluid.initializer.Xavier(uniform=False), + bias_attr=False, + num_flatten_dims=2) + return proj_out + + +def positionwise_feed_forward(x, d_inner_hid, d_hid): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=2, + param_attr=fluid.initializer.Uniform( + low=-(d_hid**-0.5), high=(d_hid**-0.5)), + act="relu") + out = layers.fc(input=hidden, + size=d_hid, + num_flatten_dims=2, + param_attr=fluid.initializer.Uniform( + low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5))) + return out + + +def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. + """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.initializer.Constant(1.), + bias_attr=fluid.initializer.Constant(0.)) + elif cmd == "d": # add dropout + if dropout_rate: + out = layers.dropout( + out, dropout_prob=dropout_rate, is_test=False) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def prepare_encoder(src_word, + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + src_data_shape=None, + pos_enc_param_name=None): + """Add word embeddings and position encodings. + The output tensor has a shape of: + [batch_size, max_src_length_in_batch, d_model]. + This module is used at the bottom of the encoder stacks. 
+ """ + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + param_attr=fluid.initializer.Normal(0., 1.)) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, trainable=False)) + enc_input = src_word_emb + src_pos_enc + enc_input = layers.reshape( + x=enc_input, + shape=[-1, src_max_len, src_emb_dim], + actual_shape=src_data_shape) + return layers.dropout( + enc_input, dropout_prob=dropout_rate, + is_test=False) if dropout_rate else enc_input + + +prepare_encoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[0]) +prepare_decoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[1]) + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. + """ + attn_output = multi_head_attention( + enc_input, enc_input, enc_input, attn_bias, d_key, d_value, d_model, + n_head, dropout_rate, pre_softmax_shape, post_softmax_shape) + attn_output = post_process_layer(enc_input, attn_output, "dan", + dropout_rate) + ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) + return post_process_layer(attn_output, ffd_output, "dan", dropout_rate) + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + for i in range(n_layer): + enc_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + pre_softmax_shape, + post_softmax_shape, ) + enc_input = enc_output + return enc_output + + +def decoder_layer(dec_input, + enc_output, + slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): + """ The layer to be stacked in decoder part. + The structure of this module is similar to that in the encoder part except + a multi-head attention is added to implement encoder-decoder attention. 
+ """ + slf_attn_output = multi_head_attention( + dec_input, + dec_input, + dec_input, + slf_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) + slf_attn_output = post_process_layer( + dec_input, + slf_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + enc_attn_output = multi_head_attention( + slf_attn_output, + enc_output, + enc_output, + dec_enc_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + enc_attn_output = post_process_layer( + slf_attn_output, + enc_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + ffd_output = positionwise_feed_forward( + enc_attn_output, + d_inner_hid, + d_model, ) + dec_output = post_process_layer( + enc_attn_output, + ffd_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + return dec_output + + +def decoder(dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): + """ + The decoder is composed of a stack of identical decoder_layer layers. + """ + for i in range(n_layer): + dec_output = decoder_layer( + dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + dec_input = dec_output + return dec_output + + +def make_all_inputs(input_fields): + """ + Define the input data layers for the transformer model. + """ + inputs = [] + for input_field in input_fields: + input_var = layers.data( + name=input_field, + shape=input_descs[input_field][0], + dtype=input_descs[input_field][1], + append_batch_size=False) + inputs.append(input_var) + fluid.default_startup_program().global_block().clone_variable( + input_var) + return inputs + + +def transformer( + src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + label_smooth_eps, ): + enc_inputs = make_all_inputs(encoder_data_input_fields + + encoder_util_input_fields) + + enc_output = wrap_encoder( + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + enc_inputs, ) + + dec_inputs = make_all_inputs(decoder_data_input_fields[:-1] + + decoder_util_input_fields) + + predict = wrap_decoder( + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + dec_inputs, + enc_output, ) + + # Padding index do not contribute to the total loss. The weights is used to + # cancel padding index in calculating the loss. 
+ label, weights = make_all_inputs(label_data_input_fields) + if label_smooth_eps: + label = layers.label_smooth( + label=layers.one_hot( + input=label, depth=trg_vocab_size), + epsilon=label_smooth_eps) + cost = layers.softmax_with_cross_entropy( + logits=predict, + label=label, + soft_label=True if label_smooth_eps else False) + # cost = layers.softmax_with_cross_entropy(logits=predict, label=gold) + weighted_cost = cost * weights + sum_cost = layers.reduce_sum(weighted_cost) + token_num = layers.reduce_sum(weights) + avg_cost = sum_cost / token_num + return sum_cost, avg_cost, predict, token_num + + +def wrap_encoder(src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + enc_inputs=None): + """ + The wrapper assembles together all needed layers for the encoder. + """ + if enc_inputs is None: + # This is used to implement independent encoder program in inference. + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + make_all_inputs(encoder_data_input_fields + + encoder_util_input_fields) + else: + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + enc_inputs + enc_input = prepare_encoder( + src_word, + src_pos, + src_vocab_size, + d_model, + max_length, + dropout_rate, + src_data_shape, ) + enc_output = encoder( + enc_input, + src_slf_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) + return enc_output + + +def wrap_decoder(trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + dec_inputs=None, + enc_output=None): + """ + The wrapper assembles together all needed layers for the decoder. + """ + if dec_inputs is None: + # This is used to implement independent decoder program in inference. + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + enc_output, trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape = make_all_inputs( + decoder_data_input_fields + decoder_util_input_fields) + else: + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape = dec_inputs + + dec_input = prepare_decoder( + trg_word, + trg_pos, + trg_vocab_size, + d_model, + max_length, + dropout_rate, + trg_data_shape, ) + dec_output = decoder( + dec_input, + enc_output, + trg_slf_attn_bias, + trg_src_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + # Return logits for training and probs for inference. 
+    predict = layers.reshape(
+        x=layers.fc(input=dec_output,
+                    size=trg_vocab_size,
+                    bias_attr=False,
+                    num_flatten_dims=2),
+        shape=[-1, trg_vocab_size],
+        act="softmax" if dec_inputs is None else None)
+    return predict
diff --git a/transformer/optim.py b/transformer/optim.py
new file mode 100644
index 00000000..56b5af3b
--- /dev/null
+++ b/transformer/optim.py
@@ -0,0 +1,37 @@
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+
+class LearningRateScheduler(object):
+    """
+    Wrapper for learning rate scheduling as described in the Transformer
+    paper. LearningRateScheduler adapts the learning rate externally and the
+    adapted learning rate will be fed into the main_program as input data.
+    """
+
+    def __init__(self,
+                 d_model,
+                 warmup_steps,
+                 learning_rate=0.001,
+                 current_steps=0,
+                 name="learning_rate"):
+        self.current_steps = current_steps
+        self.warmup_steps = warmup_steps
+        self.d_model = d_model
+        self.static_lr = learning_rate
+        self.learning_rate = layers.create_global_var(
+            name=name,
+            shape=[1],
+            value=float(learning_rate),
+            dtype="float32",
+            persistable=True)
+
+    def update_learning_rate(self):
+        # lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
+        self.current_steps += 1
+        lr_value = np.power(self.d_model, -0.5) * np.min([
+            np.power(self.current_steps, -0.5),
+            np.power(self.warmup_steps, -1.5) * self.current_steps
+        ])
+        return np.array([lr_value], dtype="float32")
diff --git a/transformer/run.xsh b/transformer/run.xsh
new file mode 100755
index 00000000..2f6f1ffd
--- /dev/null
+++ b/transformer/run.xsh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${transformer_cudaid:=0}  # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py
diff --git a/transformer/train.py b/transformer/train.py
new file mode 100644
index 00000000..f1b3bfe1
--- /dev/null
+++ b/transformer/train.py
@@ -0,0 +1,279 @@
+import os
+import time
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+
+from model import transformer, position_encoding_init
+from optim import LearningRateScheduler
+from transformer_config import *
+from continuous_evaluation import train_avg_ppl_kpi, train_pass_duration_kpi
+
+
+def pad_batch_data(insts,
+                   pad_idx,
+                   n_head,
+                   is_target=False,
+                   is_label=False,
+                   return_attn_bias=True,
+                   return_max_len=True,
+                   return_num_token=False):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias.
+    """
+    return_list = []
+    max_len = max(len(inst) for inst in insts)
+    num_token = reduce(
+        lambda x, y: x + y,
+        [len(inst) for inst in insts]) if return_num_token else 0
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and have no effect on parameter gradients.
+    inst_data = np.array(
+        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+    return_list += [inst_data.astype("int64").reshape([-1, 1])]
+    if is_label:  # label weight
+        inst_weight = np.array([[1.] * len(inst) + [0.] * (max_len - len(inst))
+                                for inst in insts])
+        return_list += [inst_weight.astype("float32").reshape([-1, 1])]
+    else:  # position data
+        inst_pos = np.array([
+            range(1, len(inst) + 1) + [0] * (max_len - len(inst))
+            for inst in insts
+        ])
+        return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+    if return_attn_bias:
+        if is_target:
+            # This is used to avoid attention on paddings and subsequent
+            # words.
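+            # For example, with max_len = 3 each head gets the bias matrix
+            #     [[0, -1e9, -1e9],
+            #      [0,    0, -1e9],
+            #      [0,    0,    0]]
+            # np.triu(..., 1) keeps the strictly upper triangle (the future
+            # positions of every time step) and the -1e9 bias drives their
+            # post-softmax weights to zero.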
+ slf_attn_bias_data = np.ones( + (inst_data.shape[0], max_len, max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( + [-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + if return_num_token: + return_list += [num_token] + return return_list if len(return_list) > 1 else return_list[0] + + +def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx, + trg_pad_idx, n_head, d_model): + """ + Put all padded data needed by training into a dict. + """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + + # These shape tensors are used in reshape_op. + src_data_shape = np.array([-1, src_max_len, d_model], dtype="int32") + trg_data_shape = np.array([-1, trg_max_len, d_model], dtype="int32") + src_slf_attn_pre_softmax_shape = np.array( + [-1, src_slf_attn_bias.shape[-1]], dtype="int32") + src_slf_attn_post_softmax_shape = np.array( + [-1] + list(src_slf_attn_bias.shape[1:]), dtype="int32") + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + [-1] + list(trg_slf_attn_bias.shape[1:]), dtype="int32") + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + [-1] + list(trg_src_attn_bias.shape[1:]), dtype="int32") + + lbl_word, lbl_weight, num_token = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False, + return_num_token=True) + + data_input_dict = dict( + zip(data_input_names, [ + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ])) + util_input_dict = dict( + zip(util_input_names, [ + src_data_shape, src_slf_attn_pre_softmax_shape, + src_slf_attn_post_softmax_shape, trg_data_shape, + trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, + trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape + ])) + return data_input_dict, util_input_dict, np.asarray( + [num_token], dtype="float32") + + +def read_multiple(reader, count): + def __impl__(): + res = [] + for item in reader(): + res.append(item) + if len(res) == count: + yield res + res = [] + + if len(res) == count: + yield res + + return __impl__ + + +def main(): + place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + 
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, + TrainTaskConfig.label_smooth_eps) + + lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate) + optimizer = fluid.optimizer.Adam( + learning_rate=lr_scheduler.learning_rate, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + + dev_count = fluid.core.get_cuda_device_count() + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt16.train(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + buf_size=100000), + batch_size=TrainTaskConfig.batch_size) + + # Program to do validation. + test_program = fluid.default_main_program().clone() + with fluid.program_guard(test_program): + test_program = fluid.io.get_inference_program([avg_cost]) + val_data = paddle.batch( + paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=TrainTaskConfig.batch_size) + + def test(exe): + test_total_cost = 0 + test_total_token = 0 + test_data = read_multiple(reader=val_data, count=dev_count) + for batch_id, data in enumerate(test_data()): + feed_list = [] + for place_id, data_buffer in enumerate(data): + data_input_dict, util_input_dict, _ = prepare_batch_input( + data_buffer, data_input_names, util_input_names, + ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, + ModelHyperParams.n_head, ModelHyperParams.d_model) + feed_list.append( + dict(data_input_dict.items() + util_input_dict.items())) + + outs = exe.run(feed=feed_list, + fetch_list=[sum_cost.name, token_num.name]) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + test_total_cost += sum_cost_val.sum() + test_total_token += token_num_val.sum() + test_avg_cost = test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl + + # Initialize the parameters. 
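+    # Resume from the checkpoint when TrainTaskConfig.ckpt_path is set;
+    # otherwise run the startup program for a fresh parameter initialization.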
+ if TrainTaskConfig.ckpt_path: + fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path) + lr_scheduler.current_steps = TrainTaskConfig.start_step + else: + exe.run(fluid.framework.default_startup_program()) + + data_input_names = encoder_data_input_fields + decoder_data_input_fields[: + -1] + label_data_input_fields + util_input_names = encoder_util_input_fields + decoder_util_input_fields + + train_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + loss_name=sum_cost.name) + + test_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=test_program, + share_vars_from=train_exe) + + init = False + train_data = read_multiple(reader=train_data, count=dev_count) + + for pass_id in xrange(TrainTaskConfig.pass_num): + pass_start_time = time.time() + for batch_id, data in enumerate(train_data()): + feed_list = [] + total_num_token = 0 + lr_rate = lr_scheduler.update_learning_rate() + for place_id, data_buffer in enumerate(data): + data_input_dict, util_input_dict, num_token = prepare_batch_input( + data_buffer, data_input_names, util_input_names, + ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, + ModelHyperParams.n_head, ModelHyperParams.d_model) + total_num_token += num_token + feed_list.append( + dict(data_input_dict.items() + util_input_dict.items() + + {lr_scheduler.learning_rate.name: lr_rate}.items())) + + if not init: + for pos_enc_param_name in pos_enc_param_names: + tensor = position_encoding_init( + ModelHyperParams.max_length + 1, + ModelHyperParams.d_model) + feed_list[place_id][pos_enc_param_name] = tensor + for feed_dict in feed_list: + feed_dict[ + sum_cost.name + + "@GRAD"] = 1. / total_num_token if TrainTaskConfig.use_avg_cost else np.asarray( + [1.], dtype="float32") + outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name], + feed=feed_list) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + total_sum_cost = sum_cost_val.sum( + ) # sum the cost from multi devices + total_token_num = token_num_val.sum() + total_avg_cost = total_sum_cost / total_token_num + print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % + (pass_id, batch_id, total_sum_cost, total_avg_cost, + np.exp([min(total_avg_cost, 100)]))) + init = True + # Validate and save the model for inference. + val_avg_cost, val_ppl = test(test_exe) + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + print("pass_id = " + str(pass_id) + " time_consumed = " + str( + time_consumed)) + if pass_id == TrainTaskConfig.pass_num - 1: + train_avg_ppl_kpi.add_record(np.array(val_ppl, dtype='float32')) + train_pass_duration_kpi.add_record(time_consumed) + train_avg_ppl_kpi.persist() + train_pass_duration_kpi.persist() + + +if __name__ == "__main__": + main() diff --git a/transformer/transformer_config.py b/transformer/transformer_config.py new file mode 100644 index 00000000..d37636d9 --- /dev/null +++ b/transformer/transformer_config.py @@ -0,0 +1,184 @@ +class TrainTaskConfig(object): + use_gpu = True + # the epoch number to train. + pass_num = 5 + # the number of sequences contained in a mini-batch. + batch_size = 64 + # the hyper parameters for Adam optimizer. + learning_rate = 0.001 + beta1 = 0.9 + beta2 = 0.98 + eps = 1e-9 + # the parameters for learning rate scheduling. + warmup_steps = 4000 + # the flag indicating to use average loss or sum loss when training. 
+    use_avg_cost = True
+    # the weight used to mix up the ground-truth distribution and the fixed
+    # uniform distribution in label smoothing when training.
+    # Set this as zero if label smoothing is not wanted.
+    label_smooth_eps = 0.1
+    # the directory for saving trained models.
+    model_dir = "trained_models"
+    # the directory for saving checkpoints.
+    ckpt_dir = "trained_ckpts"
+    # the directory for loading a checkpoint.
+    # If provided, continue training from the checkpoint.
+    ckpt_path = None
+    # the parameter to initialize the learning rate scheduler.
+    # It should be provided if checkpoints are used, since the checkpoint
+    # doesn't include the training step counter currently.
+    start_step = 0
+
+
+class InferTaskConfig(object):
+    use_gpu = True
+    # the number of examples in one run for sequence generation.
+    batch_size = 10
+    # the parameters for beam search.
+    beam_size = 5
+    max_length = 30
+    # the number of decoded sentences to output.
+    n_best = 1
+    # the flags indicating whether to output the special tokens.
+    output_bos = False
+    output_eos = False
+    output_unk = False
+    # the directory for loading the trained model.
+    model_path = 'trained_models/pass_10.infer.model'
+
+
+class ModelHyperParams(object):
+    # This model directly uses paddle.dataset.wmt16 in which the <bos>, <eos>
+    # and <unk> tokens have already been added. As for the <pad> token, any
+    # token included in dict can be used to pad, since the paddings' loss will
+    # be masked out and have no effect on parameter gradients.
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # size of target word dictionary.
+    trg_vocab_size = 10000
+    # index for <bos> token
+    bos_idx = 0
+    # index for <eos> token
+    eos_idx = 1
+    # index for <unk> token
+    unk_idx = 2
+    # max length of sequences.
+    # The size of the position encoding table should be at least max_length
+    # plus 1, since the sinusoid position encoding starts from 1 and 0 can be
+    # used as the padding token for position encoding.
+    max_length = 50
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of heads used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+# Here list the data shapes and data types of all inputs.
+# The shapes here act as placeholders and are set to pass the infer-shape at
+# compile time.
+input_descs = {
+    # The actual data shape of src_word is:
+    # [batch_size * max_src_len_in_batch, 1]
+    "src_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    # The actual data shape of src_pos is:
+    # [batch_size * max_src_len_in_batch, 1]
+    "src_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    # This input is used to remove attention weights on paddings in the
+    # encoder.
+    # The actual data shape of src_slf_attn_bias is:
+    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
+    "src_slf_attn_bias":
+    [(1, ModelHyperParams.n_head, (ModelHyperParams.max_length + 1),
+      (ModelHyperParams.max_length + 1)), "float32"],
+    # This shape input is used to reshape the output of the embedding layer.
+ "src_data_shape": [(3L, ), "int32"], + # This shape input is used to reshape before softmax in self attention. + "src_slf_attn_pre_softmax_shape": [(2L, ), "int32"], + # This shape input is used to reshape after softmax in self attention. + "src_slf_attn_post_softmax_shape": [(4L, ), "int32"], + # The actual data shape of trg_word is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"], + # The actual data shape of trg_pos is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"], + # This input is used to remove attention weights on paddings and + # subsequent words in the decoder. + # The actual data shape of trg_slf_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch] + "trg_slf_attn_bias": [(1, ModelHyperParams.n_head, + (ModelHyperParams.max_length + 1), + (ModelHyperParams.max_length + 1)), "float32"], + # This input is used to remove attention weights on paddings of the source + # input in the encoder-decoder attention. + # The actual data shape of trg_src_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch] + "trg_src_attn_bias": [(1, ModelHyperParams.n_head, + (ModelHyperParams.max_length + 1), + (ModelHyperParams.max_length + 1)), "float32"], + # This shape input is used to reshape the output of embedding layer. + "trg_data_shape": [(3L, ), "int32"], + # This shape input is used to reshape before softmax in self attention. + "trg_slf_attn_pre_softmax_shape": [(2L, ), "int32"], + # This shape input is used to reshape after softmax in self attention. + "trg_slf_attn_post_softmax_shape": [(4L, ), "int32"], + # This shape input is used to reshape before softmax in encoder-decoder + # attention. + "trg_src_attn_pre_softmax_shape": [(2L, ), "int32"], + # This shape input is used to reshape after softmax in encoder-decoder + # attention. + "trg_src_attn_post_softmax_shape": [(4L, ), "int32"], + # This input is used in independent decoder program for inference. + # The actual data shape of enc_output is: + # [batch_size, max_src_len_in_batch, d_model] + "enc_output": [(1, (ModelHyperParams.max_length + 1), + ModelHyperParams.d_model), "float32"], + # The actual data shape of label_word is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"], + # This input is used to mask out the loss of paddding tokens. + # The actual data shape of label_weight is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_weight": [(1 * (ModelHyperParams.max_length + 1), 1L), "float32"], +} + +# Names of position encoding table which will be initialized externally. +pos_enc_param_names = ( + "src_pos_enc_table", + "trg_pos_enc_table", ) +# separated inputs for different usages. 
+encoder_data_input_fields = ( + "src_word", + "src_pos", + "src_slf_attn_bias", ) +encoder_util_input_fields = ( + "src_data_shape", + "src_slf_attn_pre_softmax_shape", + "src_slf_attn_post_softmax_shape", ) +decoder_data_input_fields = ( + "trg_word", + "trg_pos", + "trg_slf_attn_bias", + "trg_src_attn_bias", + "enc_output", ) +decoder_util_input_fields = ( + "trg_data_shape", + "trg_slf_attn_pre_softmax_shape", + "trg_slf_attn_post_softmax_shape", + "trg_src_attn_pre_softmax_shape", + "trg_src_attn_post_softmax_shape", ) +label_data_input_fields = ( + "lbl_word", + "lbl_weight", ) diff --git a/vgg16/continuous_evaluation.py b/vgg16/continuous_evaluation.py new file mode 100644 index 00000000..24b09ddd --- /dev/null +++ b/vgg16/continuous_evaluation.py @@ -0,0 +1,22 @@ +""" +continuous_evaluation.py +""" +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import AccKpi +from kpi import CostKpi +from kpi import DurationKpi + +cifar10_128_train_speed_kpi = AccKpi('cifar10_128_train_speed', 0.02, 0, actived=True) +cifar10_128_gpu_memory_kpi = DurationKpi('cifar10_128_gpu_memory', 0.1, 0, actived=True) + +flowers_32_train_speed_kpi = AccKpi('flowers_32_train_speed', 0.02, 0, actived=True) +flowers_32_gpu_memory_kpi = DurationKpi('flowers_32_gpu_memory', 0.1, 0, actived=True) + +tracking_kpis = [ + cifar10_128_train_speed_kpi, + cifar10_128_gpu_memory_kpi, + flowers_32_train_speed_kpi, + flowers_32_gpu_memory_kpi, +] diff --git a/vgg16/get_gpu_data.py b/vgg16/get_gpu_data.py new file mode 100644 index 00000000..1e391253 --- /dev/null +++ b/vgg16/get_gpu_data.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +######################################################################## +# +# Copyright (c) 2018 Baidu.com, Inc. 
All Rights Reserved
+#
+########################################################################
+"""
+File: get_gpu_data.py
+Author: paddle(paddle@baidu.com)
+Date: 2018/04/02 15:57:14
+"""
+import argparse
+from continuous_evaluation import tracking_kpis

+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+args = parser.parse_args()
+
+
+def save_gpu_data():
+    mem_list = []
+    with open('memory.txt', 'r') as f:
+        for i, data in enumerate(f.readlines()):
+            if i == 0:
+                continue
+            mem_list.append(int(data.split("\n")[0].split(" ")[0]))
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size):
+            gpu_memory_kpi = kpi
+    gpu_memory_kpi.add_record(max(mem_list))
+    gpu_memory_kpi.persist()
+
+
+if __name__ == "__main__":
+    save_gpu_data()
diff --git a/vgg16/latest_kpis/cifar10_128_gpu_memory_factor.txt b/vgg16/latest_kpis/cifar10_128_gpu_memory_factor.txt
new file mode 100644
index 00000000..e6827254
--- /dev/null
+++ b/vgg16/latest_kpis/cifar10_128_gpu_memory_factor.txt
@@ -0,0 +1 @@
+[2198]
diff --git a/vgg16/latest_kpis/cifar10_128_train_speed_factor.txt b/vgg16/latest_kpis/cifar10_128_train_speed_factor.txt
new file mode 100644
index 00000000..49da8670
--- /dev/null
+++ b/vgg16/latest_kpis/cifar10_128_train_speed_factor.txt
@@ -0,0 +1 @@
+[735.5991821289062]
diff --git a/vgg16/latest_kpis/flowers_32_gpu_memory_factor.txt b/vgg16/latest_kpis/flowers_32_gpu_memory_factor.txt
new file mode 100644
index 00000000..590bed9c
--- /dev/null
+++ b/vgg16/latest_kpis/flowers_32_gpu_memory_factor.txt
@@ -0,0 +1 @@
+[8938]
diff --git a/vgg16/latest_kpis/flowers_32_train_speed_factor.txt b/vgg16/latest_kpis/flowers_32_train_speed_factor.txt
new file mode 100644
index 00000000..b767a1cd
--- /dev/null
+++ b/vgg16/latest_kpis/flowers_32_train_speed_factor.txt
@@ -0,0 +1 @@
+[51.00917434692383]
diff --git a/vgg16/model.py b/vgg16/model.py
new file mode 100644
index 00000000..fa5c25d5
--- /dev/null
+++ b/vgg16/model.py
@@ -0,0 +1,289 @@
+"""
+VGG16 benchmark in Fluid
+"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import commands
+import subprocess
+import threading
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+
+from continuous_evaluation import tracking_kpis
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    "--gpu_id",
+    type=int,
+    default=3,
+    help="The GPU Card Id.
(default: %(default)d)") +parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') +args = parser.parse_args() + + +def vgg16_bn_drop(input): + """ + vgg16_bn_drop + """ + + def conv_block(input, num_filter, groups, dropouts): + """ + conv_block + """ + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def main(): + """ + main + """ + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + opts = optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # Initialize executor + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + # test + def test(exe): + """ + test + """ + test_accuracy = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype( + "float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") 
+            y_data = y_data.reshape([-1, 1])
+
+            acc, weight = exe.run(inference_program,
+                                  feed={"pixel": img_data,
+                                        "label": y_data},
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
+        return test_accuracy.eval()
+
+    train_acc_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_train_acc' % (args.data_set, args.batch_size):
+            train_acc_kpi = kpi
+    train_speed_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_train_speed' % (args.data_set, args.batch_size):
+            train_speed_kpi = kpi
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            img_data = np.array(
+                map(lambda x: x[0].reshape(data_shape), data)).astype(
+                    "float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+            accuracy.add(value=acc, weight=weight)
+            iters += 1
+            num_samples += len(y_data)
+            if (batch_id % 10) == 0:
+                print(
+                    "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                    (pass_id, iters, loss, acc)
+                )  # The accuracy is the accumulation of batches, not that of the current batch.
+
+        # pass_train_acc = accuracy.eval()
+        train_losses.append(loss)
+        train_accs.append(acc)
+        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        #train_acc_kpi.add_record(np.array(train_accs, dtype='float32'))
+        train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32'))
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        break
+
+#train_acc_kpi.persist()
+    train_speed_kpi.persist()
+
+
+def print_arguments():
+    """
+    print_arguments
+    """
+    print('----------- vgg Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def collect_gpu_memory_data(alive):
+    """
+    collect the GPU memory data
+    """
+    global is_alive
+    status, output = commands.getstatusoutput('rm -rf memory.txt')
+    if status == 0:
+        print('del memory.txt')
+    command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id
+    p = subprocess.Popen(command, shell=True)
+    if p.pid < 0:
+        print('Get GPU memory data error')
+    while (is_alive):
+        time.sleep(1)
+    p.kill()
+
+
+def save_gpu_data(mem_list):
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size):
+            gpu_memory_kpi = kpi
+    gpu_memory_kpi.add_record(max(mem_list))
+    gpu_memory_kpi.persist()
+
+
+if __name__ == "__main__":
+    print_arguments()
+    global is_alive
+    is_alive = True
+    collect_memory_thread = threading.Thread(
+        target=collect_gpu_memory_data, args=(is_alive, ))
+    collect_memory_thread.setDaemon(True)
+    collect_memory_thread.start()
+    main()
+    is_alive = False
diff --git a/vgg16/run.xsh b/vgg16/run.xsh
new file
mode 100755 index 00000000..7984ae3f --- /dev/null +++ b/vgg16/run.xsh @@ -0,0 +1,19 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +cudaid=${vgg16_cudaid:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + + +#cifar10 128 +FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=128 --data_set=cifar10 --iterations=300 --gpu_id=$cudaid +python get_gpu_data.py --batch_size=128 --data_set=cifar10 + +#flowers 32 +FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=32 --data_set=flowers --iterations=100 --gpu_id=$cudaid +python get_gpu_data.py --batch_size=32 --data_set=flowers +for pid in $(ps -ef | grep nvidia-smi | grep -v grep | cut -c 9-15); do + echo $pid + kill -9 $pid +done diff --git a/vgg16_aws_dist/__init__.py b/vgg16_aws_dist/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vgg16_aws_dist/ce_runner.py b/vgg16_aws_dist/ce_runner.py new file mode 100644 index 00000000..bbc19351 --- /dev/null +++ b/vgg16_aws_dist/ce_runner.py @@ -0,0 +1,350 @@ +import argparse +import logging +import sys, os +import numpy as np +import threading +import copy +import csv +from aws_runner.client.train_command import TrainCommand + +# for ce env ONLY + +sys.path.append(os.environ['ceroot']) +from continuous_evaluation import cluster_specs, kpis_map, generate_kpi_id, generate_cluster_id + +from aws_runner.client.abclient import Abclient + +def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + +parser = argparse.ArgumentParser(description=__doc__) + +parser.add_argument( + '--key_name', type=str, default="", help="required, key pair name") +parser.add_argument( + '--security_group_id', + type=str, + default="", + help="required, the security group id associated with your VPC") + +parser.add_argument( + '--vpc_id', + type=str, + default="", + help="The VPC in which you wish to run test") +parser.add_argument( + '--subnet_id', + type=str, + default="", + help="The Subnet_id in which you wish to run test") + +parser.add_argument( + '--pserver_instance_type', + type=str, + default="c5.2xlarge", + help="your pserver instance type, c5.2xlarge by default") +parser.add_argument( + '--trainer_instance_type', + type=str, + default="p2.8xlarge", + help="your trainer instance type, p2.8xlarge by default") + +parser.add_argument( + '--task_name', + type=str, + default="", + help="the name you want to identify your job") + +parser.add_argument( + '--pserver_image_id', + type=str, + default="ami-da2c1cbf", + help="ami id for system image, default one has nvidia-docker ready, \ + use ami-1ae93962 for us-east-2") + +parser.add_argument( + '--pserver_command', + type=str, + default="", + help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes" +) + +parser.add_argument( + '--trainer_image_id', + type=str, + default="ami-da2c1cbf", + help="ami id for system image, default one has nvidia-docker ready, \ + use ami-1ae93962 for us-west-2") + +parser.add_argument( + '--trainer_command', + type=str, + default="", + help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes" +) + 
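+# Note on the command strings above: they are presumably parsed by
+# TrainCommand as comma-separated argv tokens carrying colon-separated
+# key:value pairs, e.g. "python,vgg.py,batch_size:128,is_local:yes" as in the
+# help text; train_with_spec later updates pairs such as batch_size through
+# TrainCommand.update.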
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--action', type=str, default="create", help="create|cleanup|status")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_public_ip', type=str, help="master server public ip")
+
+parser.add_argument(
+    '--master_docker_image',
+    type=str,
+    default="putcn/paddle_aws_master:latest",
+    help="master docker image id")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+parser.add_argument(
+    '--online_mode',
+    type=str2bool,
+    default=False,
+    help="whether the client actively stays online")
+
+args = parser.parse_args()
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+class DataCollector(object):
+    _instance_store = {}
+    @classmethod
+    def get_instance_by_spec(cls, cluster_spec):
+        cluster_id = generate_cluster_id(cluster_spec)
+        if cluster_id not in cls._instance_store:
+            cls._instance_store[cluster_id] = cls(cluster_spec)
+        return cls._instance_store[cluster_id]
+    @classmethod
+    def persist_all(cls):
+        for _, collector in cls._instance_store.iteritems():
+            collector.persist()
+    @classmethod
+    def generate_csv(cls):
+        with open("report.csv", "w") as csvfile:
+            fieldnames = []
+            rows = []
+            for cluster_id, collector in cls._instance_store.iteritems():
+                row = {
+                    "cluster_spec": cluster_id
+                }
+                for metric_name, _ in collector.store.iteritems():
+                    if metric_name not in fieldnames:
+                        fieldnames.append(metric_name)
+                    row[metric_name] = collector.avg(metric_name)
+                rows.append(row)
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            for row in rows:
+                writer.writerow(row)
+    def __init__(self, cluster_spec):
+        self.store = {}
+        self.metric_data_identifier = "**metrics_data: "
+        self.cluster_spec = cluster_spec
+        self.cluster_id = generate_cluster_id(cluster_spec)
+    def log_processor(self, source, log_type):
+        for msg in iter(source.readline, ""):
+            logging.info(self.cluster_id)
+            logging.info(msg)
+            if (msg.startswith(self.metric_data_identifier)):
+                logging.info("metric data found, parse and save it")
+                str_msg = msg.replace(self.metric_data_identifier, "")
+                metrics_raw = str_msg.split(",")
+                for metric in metrics_raw:
+                    metric_data = metric.split("=")
+                    self.save(metric_data[0], metric_data[1])
+    def save(self, key, val):
+        key = key.strip()
+        if isinstance(val, str):
+            val = val.strip()
+        if (key not in self.store):
+            self.store[key] = []
+        logging.info("going to save " + key + "=" + str(val) + " from " +
+                     self.cluster_id)
+        self.store[key].append(float(val))
+    def get(self, key):
+        if (key in self.store):
+            return self.store[key]
+        return None
+    def avg(self, key):
+        vals = self.get(key)
+        if vals is None:
+            return None
+        return sum(vals)/float(len(vals))
+    def persist(self):
+        for metric_name, _ in self.store.iteritems():
+            kpi_id = generate_kpi_id(metric_name, self.cluster_spec)
+            logging.info("going to persist kpi " + kpi_id)
+            if kpi_id in kpis_map:
+                kpi_instance = kpis_map[kpi_id]
+                kpi_instance.add_record(np.array(self.avg(metric_name), dtype='float32'))
+                kpi_instance.persist()
+                logging.info("done persisting kpi " + kpi_id)
+            else:
+                logging.info("no such kpi id found in map!!!")
+                logging.info(kpi_id)
+
+def train_with_spec(spec, args, lock):
+    logging.info("updating cluster config and starting client")
+    test_name = spec[0]
+    batch_size = spec[1]
+    args.trainer_count = spec[2]
+    gpus_per_trainer_count = spec[3]
+    args.pserver_count = spec[4]
+    trainer_command = TrainCommand(args.trainer_command)
+
+    command_to_update = {
+        "model": test_name,
+        "batch_size": str(batch_size),
+        "gpus": str(gpus_per_trainer_count),
+    }
+
+    if args.pserver_count == 0 and args.trainer_count == 1:
+        command_to_update["update_method"] = "local"
+    ''' not yet supported because aws runner can't provide PADDLE_TRAINER_IPS
+    if args.pserver_count == 0 and args.trainer_count > 1:
+        command_to_update["update_method"] = "nccl2"
+    '''
+
+    trainer_command.update(command_to_update)
+    args.trainer_command = trainer_command.unparse()
+    args.pserver_command = args.trainer_command
+
+    data_collector = DataCollector.get_instance_by_spec(spec)
+
+    logging.info(args)
+    abclient = Abclient(args, data_collector.log_processor, lock)
+    abclient.create()
+
+'''
+ClusterIterator relies on the following spec structure:
+    test_name, batch_size, trainer_count, gpus_per_trainer_count, pserver_count
+    cluster_specs = [
+        ["vgg", 64, 1, 1, 0],
+        ["vgg", 64, 8, 1, 8],
+        ["vgg", 64, 16, 1, 8],
+        ["vgg", 64, 32, 1, 8],
+    ]
+It sequentially distributes the specs into chunks and makes sure each chunk
+does not exceed the trainer and pserver count thresholds.
+The specs above will be distributed into 2 chunks:
+[["vgg", 64, 1, 1, 0], ["vgg", 64, 8, 1, 8]]
+and
+[["vgg", 64, 16, 1, 8]]
+
+["vgg", 64, 32, 1, 8] itself does not fit in a single chunk, thus it gets
+discarded.
+'''
+class ClusterIterator:
+    def __init__(self, specs, trainer_count_threshold = 32, pserver_count_threshold = 10):
+        self.specs = specs
+        self.trainer_count_threshold = trainer_count_threshold
+        self.pserver_count_threshold = pserver_count_threshold
+        self.bad_specs = []
+    def __iter__(self):
+        return self
+    def spec_can_not_fit(self, trainer_count, pserver_count):
+        return (trainer_count > self.trainer_count_threshold or
+                pserver_count > self.pserver_count_threshold)
+    def next(self):
+        specs_to_ret = []
+        trainer_count = 0
+        pserver_count = 0
+        if len(self.specs) == 0:
+            raise StopIteration()
+        else:
+            while len(self.specs) != 0:
+                next_spec = self.specs[0]
+                # when a single spec can't even fit, move it to the bad spec list
+                if self.spec_can_not_fit(next_spec[2], next_spec[4]):
+                    self.bad_specs.append(self.specs.pop(0))
+                    continue
+                trainer_count += next_spec[2]
+                pserver_count += next_spec[4]
+                if self.spec_can_not_fit(trainer_count, pserver_count):
+                    break
+                specs_to_ret.append(self.specs.pop(0))
+        if len(specs_to_ret) == 0:
+            if len(self.bad_specs) != 0:
+                logging.info("%d specs were not able to fit in any test chunk"
+                             % len(self.bad_specs))
+            raise StopIteration()
+        return specs_to_ret
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        lock = threading.Lock()
+        cluster_specs_origin = copy.copy(cluster_specs)
+        for specs in ClusterIterator(cluster_specs):
+            logging.info("starting a new chunk of test")
+            testing_threads = []
+            for cluster_spec in specs:
+                logging.info("creating cluster thread with spec")
+                logging.info(cluster_spec)
+                thread = threading.Thread(
+                    target=train_with_spec,
+                    args=(cluster_spec, copy.copy(args), lock,)
+                )
+                testing_threads.append(thread)
+
+            for testing_thread in testing_threads:
+                testing_thread.start()
+
+            for testing_thread in testing_threads:
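+                # join() blocks until every cluster thread in this chunk has
+                # finished before the next chunk starts.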
+                testing_thread.join()
+            logging.info("testing chunk ended")
+
+        logging.info("all testing ended")
+
+        # generate speedup rate
+        # the 0th spec is the baseline
+        def get_speed_and_collector_by_spec(spec):
+            data_collector = DataCollector.get_instance_by_spec(spec)
+            return data_collector.avg("train_speed"), data_collector
+
+        logging.info("generating speedup")
+
+        # base_speed is supposed to be one trainer, one gpu, local mode
+        base_speed, _ = get_speed_and_collector_by_spec(cluster_specs_origin[0])
+        if base_speed is not None:
+            logging.info("base speed is %f" % base_speed)
+            for cluster_spec in cluster_specs_origin:
+                speed, data_collector = get_speed_and_collector_by_spec(cluster_spec)
+                if speed is not None:
+                    # speedup = speed * trainer_count / base_speed
+                    data_collector.save("speedup", speed*cluster_spec[2]/base_speed)
+        else:
+            logging.info("base speed is not available")
+
+        DataCollector.persist_all()
+        # DataCollector.generate_csv()
+
diff --git a/vgg16_aws_dist/continuous_evaluation.py b/vgg16_aws_dist/continuous_evaluation.py
new file mode 100644
index 00000000..dea1aa0e
--- /dev/null
+++ b/vgg16_aws_dist/continuous_evaluation.py
@@ -0,0 +1,44 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import LessWorseKpi, GreaterWorseKpi
+
+kpis_specs = {
+    "speedup": [LessWorseKpi, 0.01],
+    "train_speed":[LessWorseKpi, 0.01],
+    # "converge_speed":[GreaterWorseKpi, 0.01],
+    # "gpu_memory":[GreaterWorseKpi, 0.01],
+    # "acc_4passes":[GreaterWorseKpi, 0.01],
+}
+
+# each row represents a cluster setting with the following columns:
+# test_name, batch_size, trainer_count, gpus_per_trainer_count, pserver_count
+# disable production cluster config for now
+# cluster_specs = [
+#     ["mnist", 64, 1, 1, 0],
+#     ["mnist", 64, 8, 1, 8],
+#     ["mnist", 64, 16, 1, 8],
+#     ["mnist", 64, 32, 1, 8],
+# ]
+
+cluster_specs = [
+    ["vgg", 16, 1, 1, 0],
+    ["vgg", 16, 4, 4, 4],
+    ["vgg", 16, 7, 8, 7],
+]
+
+kpis_map = {}
+
+tracking_kpis = []
+
+def generate_cluster_id(cluster_spec):
+    return "_".join(map(str, cluster_spec))
+def generate_kpi_id(kpi_name, cluster_spec):
+    return kpi_name + "_" + generate_cluster_id(cluster_spec)
+
+for kpi_type_name, (Kpi_class, diff_thre) in kpis_specs.items():
+    for cluster_spec in cluster_specs:
+        kpi_id = generate_kpi_id(kpi_type_name, cluster_spec)
+        the_kpi = Kpi_class(kpi_id, diff_thre)
+        tracking_kpis.append(the_kpi)
+        kpis_map[kpi_id] = the_kpi
\ No newline at end of file
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/Dockerfile b/vgg16_aws_dist/fluid_benchmark_for_aws/Dockerfile
new file mode 100644
index 00000000..bef80bb6
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/Dockerfile
@@ -0,0 +1,7 @@
+FROM paddlepaddlece/paddle:latest
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN apt install -y python-opencv
+ENTRYPOINT ["python", "fluid_benchmark.py"]
\ No newline at end of file
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/README.md b/vgg16_aws_dist/fluid_benchmark_for_aws/README.md
new file mode 100644
index 00000000..357ce932
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/README.md
@@ -0,0 +1,73 @@
+# Fluid Benchmark
+
+Originally from https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/fluid
+
+This directory contains several model configurations and tools used to run
+Fluid benchmarks for local and distributed training.
+
+
+## Run the Benchmark
+
+To start, run the following command to get the full help message:
+
+```bash
+python fluid_benchmark.py --help
+```
+
+Currently supported `--model` arguments include:
+
+* mnist
+* resnet
+  * you can choose to use a different dataset using `--data_set cifar10` or
+    `--data_set flowers`.
+* vgg
+* stacked_dynamic_lstm
+* machine_translation
+
+* Run the following command to start a benchmark job locally:
+  ```bash
+  python fluid_benchmark.py --model mnist --device GPU
+  ```
+  You can choose to use GPU/CPU training. With GPU training, you can specify
+  `--gpus <gpu_num>` to run multi-GPU training.
+* Run distributed training with parameter servers:
+  * start parameter servers:
+    ```bash
+    PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
+    ```
+  * start trainers:
+    ```bash
+    PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
+    ```
+* Run distributed training using NCCL2
+  ```bash
+  PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
+  ```
+
+## Run Distributed Benchmark on Kubernetes Cluster
+
+We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
+distributed benchmark jobs to your cluster. To generate a job yaml, just run:
+
+```bash
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+```
+
+Then the yaml files are generated under the directory `myjob`, and you can run:
+
+```bash
+kubectl create -f myjob/
+```
+
+The job should then start.
+
+
+## Notes for Running Fluid Distributed with NCCL2 and RDMA
+
+Before running NCCL2 distributed jobs, please check whether your node has multiple
+network interfaces; if so, try adding the environment variable
+`export NCCL_SOCKET_IFNAME=eth0` so that your actual network device is used.
+
+To run high-performance distributed training, you must prepare your hardware
+environment to be able to run RDMA-enabled network communication; please check
+out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
+note for details.
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/fluid_benchmark.py b/vgg16_aws_dist/fluid_benchmark_for_aws/fluid_benchmark.py
new file mode 100644
index 00000000..0f780a49
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/fluid_benchmark.py
@@ -0,0 +1,461 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import cProfile
+import time
+import os
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
+
+BENCHMARK_MODELS = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Fluid model benchmarks.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=BENCHMARK_MODELS,
+        default='resnet',
+        help='The model to run benchmark with.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.001,
+        help='The learning rate.')
+    # TODO(wuyi): add "--use_fake_data" option back.
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of leading minibatches to skip, for a more accurate performance test.'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data format; currently only NCHW is supported.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--no_test',
+        action='store_false',
+        help='If set, test the test set during training.')
+    parser.add_argument(
+        '--memory_optimize',
+        action='store_true',
+        help='If set, optimize runtime memory before start.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='If set, omit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default='local',
+        choices=['local', 'pserver', 'nccl2'],
+        help='Choose parameter update method, can be local, pserver, nccl2.')
+
+    parser.add_argument(
+        "--acc_target", default=0.6, type=float, help="training will be terminated when acc_target is reached")
+
+    args = parser.parse_args()
+    return args
+
+
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
+        # append gen_nccl_id at the end of startup program
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        port = os.getenv("PADDLE_PSERVER_PORT")
+        worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+        worker_endpoints = []
+        for ip in worker_ips.split(","):
+            worker_endpoints.append(':'.join([ip, port]))
+        num_trainers = len(worker_endpoints)
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
+        worker_endpoints.remove(current_endpoint)
+
+        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+            name="NCCLID",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+
fluid.default_startup_program().global_block().append_op( + type="gen_nccl_id", + inputs={}, + outputs={"NCCLID": nccl_id_var}, + attrs={ + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + "trainer_id": trainer_id + }) + return nccl_id_var, num_trainers, trainer_id + else: + raise Exception("must set positive PADDLE_TRAINER_ID env variables for " + "nccl-based dist train.") + + +def dist_transpile(trainer_id): + if trainer_id < 0: + return None, None + + # the port of all pservers, needed by both trainer and pserver + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + # comma separated ips of all pservers, needed by trainer and + # pserver + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + # total number of workers/trainers in the job, needed by + # trainer and pserver + trainers = int(os.getenv("PADDLE_TRAINERS")) + # the IP of the local machine, needed by pserver only + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + # the role, should be either PSERVER or TRAINER + training_role = os.getenv("PADDLE_TRAINING_ROLE") + + t = distribute_transpiler.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if training_role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, + pserver_program) + return pserver_program, pserver_startup_program + elif training_role == "TRAINER": + train_program = t.get_trainer_program() + return train_program, fluid.default_startup_program() + else: + raise ValueError( + 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' + ) + + +def test(exe, inference_program, test_reader, feeder, batch_acc): + accuracy_evaluator = fluid.metrics.Accuracy() + for batch_id, data in enumerate(test_reader()): + acc = exe.run(inference_program, + feed=feeder.feed(data), + fetch_list=[batch_acc]) + accuracy_evaluator.update(value=np.array(acc), weight=len(data)) + + return accuracy_evaluator.eval() + + +# TODO(wuyi): replace train, train_parallel, test functions with new trainer +# API once it is ready. 
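+
+# dist_transpile() above is driven entirely by environment variables; a
+# minimal two-pserver setup would look like this (values illustrative):
+#
+#   export PADDLE_PSERVER_PORT=6174
+#   export PADDLE_PSERVER_IPS=192.168.0.10,192.168.0.11  # comma separated
+#   export PADDLE_TRAINERS=2
+#   export PADDLE_CURRENT_IP=192.168.0.10                # this machine
+#   export PADDLE_TRAINING_ROLE=PSERVER                  # or TRAINER
+#
+# PADDLE_TRAINER_ID must also be >= 0, since dist_transpile() returns
+# (None, None) for a negative trainer id.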
+def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, batch_size_tensor, + args, train_prog, startup_prog): + if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER": + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(train_prog) + return + + if args.use_fake_data: + raise Exception( + "fake data is not supported in single GPU test for now.") + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_prog) + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + + acc_4passes = None + converge_speed = None + train_pass_acc = fluid.average.WeightedAverage() + fetch_list = [avg_loss] + if batch_acc is not None: + fetch_list.append(batch_acc) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + train_losses = [] + train_pass_acc.reset() + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + outs = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=fetch_list) + iters += 1 + num_samples += len(data) + loss = outs[0] + if batch_acc is not None: + acc = np.mean(outs[1]).item() + train_pass_acc.add(value=acc, weight=len(data)) + else: + acc = None + train_losses.append(loss) + print("Pass: %d, Iter: %d, Loss: %f, acc %s\n" % + (pass_id, iters, np.mean(train_losses), str(acc))) + if converge_speed is None and args.acc_target and acc >= args.acc_target: + converge_speed = time.time() - start_time + print("converge_speed set with %f" % converge_speed) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + if batch_acc is not None: + pass_train_acc = train_pass_acc.eval() + else: + pass_train_acc = None + + if pass_id == 4 and batch_acc is not None: + print("acc_4passes set with %f" % pass_train_acc) + acc_4passes = float(pass_train_acc) + + output_metric_data(pass_id, examples_per_sec, pass_train_acc, acc_4passes, converge_speed) + + # evaluation + if not args.no_test and batch_acc != None: + pass_test_acc = test(exe, infer_prog, test_reader, feeder, + batch_acc) + print(", Test Accuracy: %f" % pass_test_acc) + print("\n") + # TODO(wuyi): add warmup passes to get better perf data. + exit(0) + + +# TODO(wuyi): replace train, train_parallel, test functions with new trainer +# API once it is ready. 
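+
+# Both train() and train_parallel() below account for throughput the same
+# way: the timer and sample counter are reset once iters == skip_batch_num,
+# so examples_per_sec = num_samples / elapsed covers only steady-state
+# batches. With --use_fake_data, train_parallel() additionally replaces each
+# data variable with a fill_constant op so reader I/O is excluded entirely.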
+def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, + batch_acc, batch_size_tensor, args, train_prog, startup_prog, nccl_id_var, + num_trainers, trainer_id): + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + # generate fake: + if args.use_fake_data: + for var in feed_var_list: + v = startup_prog.global_block().clone_variable(var) + var.persistable = True + v.persistable = True + + real_shape = list(var.shape) + real_shape[0] = args.batch_size / args.gpus + startup_prog.global_block().append_op( + outputs={"Out": v}, + type="fill_constant", + attrs={"shape": real_shape, + "value": 1.0, + "dtype": var.dtype}) + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + if nccl_id_var and trainer_id == 0: + #FIXME(wuyi): wait other trainer to start listening + time.sleep(30) + + startup_exe = fluid.Executor(place) + startup_exe.run(startup_prog) + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, + avg_loss.name, + exec_strategy=strategy, + num_trainers=num_trainers, + trainer_id=trainer_id) + + feeder = fluid.DataFeeder(feed_var_list, place) + acc_4passes = None + converge_speed = None + accuracy_evaluator = fluid.metrics.Accuracy() + fetch_list = [avg_loss.name] + if batch_acc is not None: + fetch_list.append(batch_acc.name) + start_time = time.time() + + for pass_id in range(args.pass_num): + num_samples = 0 + iters = 0 + pass_start_time = time.time() + accuracy_evaluator.reset() + for batch_id, data in enumerate(train_reader()): + if args.profile and pass_id == 0 and batch_id == 5: + profiler.start_profiler("All") + elif args.profile and pass_id == 0 and batch_id == 10: + profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id) + + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + if args.use_fake_data: + outs = exe.run(fetch_list) + else: + outs = exe.run(fetch_list, feed=feeder.feed(data)) + + if args.update_method == "pserver": + exe.bcast_params() + num_samples += len(data) + iters += 1 + + if batch_acc is not None: + acc = np.mean(outs[1]).item() + accuracy_evaluator.update(value=acc, weight=len(data)) + else: + acc = None + + if batch_id % 1 == 0: + print("Pass %d, batch %d, loss %s, acc %s" % + (pass_id, batch_id, np.mean(outs[0]), str(acc))) + if converge_speed is None and args.acc_target and acc >= args.acc_target: + converge_speed = time.time() - start_time + print("converge_speed set with %f" % converge_speed) + + pass_elapsed = time.time() - pass_start_time + examples_per_sec = num_samples / pass_elapsed + if batch_acc is not None: + pass_train_acc = accuracy_evaluator.eval() + else: + pass_train_acc = None + + if pass_id == 4 and batch_acc is not None: + print("acc_4passes set with %f" % pass_train_acc) + acc_4passes = float(pass_train_acc) + + output_metric_data(pass_id, examples_per_sec, pass_train_acc, acc_4passes, converge_speed) + + if not args.no_test and batch_acc != None: + test_acc = test(startup_exe, infer_prog, test_reader, feeder, + batch_acc) + print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) + exit(0) + +def output_metric_data(pass_id, examples_per_sec, pass_train_acc, acc_4passes, converge_speed): + msgs = [] + msgs.append("pass = %d" % pass_id) + msgs.append("train_speed = %f" % float(examples_per_sec)) + if isinstance(pass_train_acc, float): + msgs.append("train_accuracy = %f" % 
pass_train_acc)
+    if isinstance(acc_4passes, float):
+        msgs.append("acc_4passes = %f" % acc_4passes)
+    if isinstance(converge_speed, float):
+        msgs.append("converge_speed = %f" % converge_speed)
+    print("**metrics_data: " + ", ".join(msgs))
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- %s Configuration Arguments -----------' % args.model)
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def main():
+    args = parse_args()
+    print_arguments(args)
+
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
+
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    model_def = __import__("models.%s" % args.model, fromlist=["models"])
+    train_args = list(model_def.get_model(args))
+    train_args.append(args)
+    # Run optimizer.minimize(avg_loss)
+    train_args[2].minimize(train_args[0])
+    if args.memory_optimize:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    if args.update_method == "pserver":
+        train_prog, startup_prog = dist_transpile(trainer_id)
+        if not train_prog:
+            raise Exception(
+                "Must configure correct environments to run dist train.")
+        train_args.extend([train_prog, startup_prog])
+        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
+            train_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*train_args)
+        train(*train_args)
+        exit(0)
+
+    # for other update methods, use default programs
+    train_args.append(fluid.default_main_program())
+    train_args.append(fluid.default_startup_program())
+
+    if args.update_method == "nccl2":
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
+    if args.gpus == 1:
+        # NOTE: parallel executor uses profiler internally
+        if args.use_nvprof and args.device == 'GPU':
+            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+                train(*train_args)
+        else:
+            train(*train_args)
+    else:
+        if args.device == "CPU":
+            raise Exception("Only support GPU perf with parallel exe")
+        train_args.extend([nccl_id_var, num_trainers, trainer_id])
+        train_parallel(*train_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/kube_gen_job.py b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_gen_job.py
new file mode 100644
index 00000000..39ba207f
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_gen_job.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
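+
+# Generates Kubernetes yaml for a distributed benchmark job; typical usage
+# (mirroring the README):
+#
+#   python kube_gen_job.py --jobname myjob --pservers 4 --trainers 4 \
+#       --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 \
+#       --entry "python fluid_benchmark.py ..." --disttype pserver
+#
+# This writes myjob/pserver.yaml (only for --disttype pserver) and
+# myjob/trainer.yaml, ready for `kubectl create -f myjob/`.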
+
+import yaml
+import copy
+import argparse
+import random
+import os
+from kube_templates import pserver, trainer, envs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Generate dist job yamls.')
+
+    parser.add_argument(
+        '--jobname', default="paddlejob", help='unique job name')
+    parser.add_argument(
+        '--cpu', default=1, type=int, help='CPU cores per trainer node')
+    parser.add_argument(
+        '--pscpu', default=1, type=int, help='CPU cores per pserver node')
+    parser.add_argument(
+        '--gpu', default=0, type=int, help='num of GPUs per node')
+    parser.add_argument(
+        '--image',
+        default="bootstrapper:5000/fluid_benchmark:gpu",
+        help='docker image to run the job with')
+    parser.add_argument(
+        '--pservers', default=1, type=int, help='num of pservers')
+    parser.add_argument(
+        '--trainers', default=1, type=int, help='num of trainers')
+    parser.add_argument('--memory', default=1, type=int, help='trainer memory')
+    parser.add_argument(
+        '--psmemory', default=1, type=int, help='pserver memory')
+    parser.add_argument(
+        '--port', default=30236, type=int, help='pserver port')
+    parser.add_argument(
+        '--entry', default="python train.py", help='command to run')
+    parser.add_argument(
+        '--fluid', default=1, type=int, help='whether is fluid job')
+    parser.add_argument(
+        '--rdma', action='store_true', help='whether mount rdma libs')
+    parser.add_argument(
+        '--disttype',
+        default="pserver",
+        type=str,
+        choices=['pserver', 'nccl2', 'local'],
+        help='pserver or nccl2 or local')
+
+    args = parser.parse_args()
+    return args
+
+
+def gen_job():
+    ps = pserver
+    tn = trainer
+    args = parse_args()
+
+    ps_container = ps["spec"]["template"]["spec"]["containers"][0]
+    tn_container = tn["spec"]["template"]["spec"]["containers"][0]
+
+    if args.fluid == 1:
+        ps_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+        tn_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+    ps["metadata"]["name"] = args.jobname + "-pserver"
+    ps["spec"]["template"]["metadata"]["labels"][
+        "paddle-job-pserver"] = args.jobname
+    tn["metadata"]["name"] = args.jobname + "-trainer"
+    tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
+
+    ps_container["image"] = args.image
+    tn_container["image"] = args.image
+
+    ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
+    ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
+
+    tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
+    tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
+    if args.gpu > 0:
+        tn_container["resources"]["requests"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+        tn_container["resources"]["limits"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+
+    ps["spec"]["replicas"] = int(args.pservers)
+    tn["spec"]["parallelism"] = int(args.trainers)
+    tn["spec"]["completions"] = int(args.trainers)
+    ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
+    ps_container["ports"][0]["containerPort"] = args.port
+    spreadport = random.randint(40000, 60000)
+    tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
+    tn_container["ports"][0]["containerPort"] = spreadport
+
+    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
+    envs.append({"name": "TRAINERS",
"value": str(args.trainers)}) + envs.append({"name": "PSERVERS", "value": str(args.pservers)}) + envs.append({"name": "ENTRY", "value": args.entry}) + envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) + envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) + # NOTE: these directories below are cluster specific, please modify + # this settings before you run on your own cluster. + envs.append({ + "name": "LD_LIBRARY_PATH", + "value": + "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind" + }) + + volumes = [{ + "name": "nvidia-driver", + "hostPath": { + "path": "/usr/local/nvidia/lib64" + } + }] + volumeMounts = [{ + "mountPath": "/usr/local/nvidia/lib64", + "name": "nvidia-driver" + }] + + if args.rdma: + volumes.extend([{ + "name": "ibetc", + "hostPath": { + "path": "/etc/libibverbs.d" + } + }, { + "name": "iblibs", + "hostPath": { + "path": "/usr/local/rdma" + } + }, { + "name": "valgrind", + "hostPath": { + "path": "/usr/lib64/mlnx_ofed/valgrind" + } + }]) + volumeMounts.extend([{ + "mountPath": "/etc/libibverbs.d", + "name": "ibetc" + }, { + "mountPath": "/usr/local/rdma", + "name": "iblibs" + }, { + "mountPath": "/usr/lib64/mlnx_ofed/valgrind", + "name": "valgrind" + }]) + # append shm for NCCL2 + volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}}) + volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"}) + + tn["spec"]["template"]["spec"]["volumes"] = volumes + tn_container["volumeMounts"] = volumeMounts + + ps_container["env"] = envs + ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) + tn_container["env"] = envs + if args.disttype == "pserver": + tn_container["env"].append({ + "name": "TRAINING_ROLE", + "value": "TRAINER" + }) + elif args.disttype == "nccl2" or args.disttype == "local": + # NCCL2 have no training role, set to plain WORKER + tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) + + os.mkdir(args.jobname) + if args.disttype == "pserver": + with open("%s/pserver.yaml" % args.jobname, "w") as fn: + yaml.dump(ps, fn) + + with open("%s/trainer.yaml" % args.jobname, "w") as fn: + yaml.dump(tn, fn) + + +if __name__ == "__main__": + gen_job() diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/__init__.py b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/__init__.py new file mode 100644 index 00000000..2d09d940 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/__init__.py @@ -0,0 +1,66 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pserver import pserver +from trainer import trainer + +__all__ = ["pserver", "trainer", "envs"] + +envs = [ + # envs that don't need to change + { + "name": "GLOG_v", + "value": "0" + }, + { + "name": "GLOG_logtostderr", + "value": "1" + }, + { + "name": "TOPOLOGY", + "value": "" + }, + { + "name": "TRAINER_PACKAGE", + "value": "/workspace" + }, + { + "name": "PADDLE_INIT_NICS", + "value": "eth2" + }, + { + "name": "NAMESPACE", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.namespace" + } + } + }, + { + "name": "POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + } + }, + { + "name": "PADDLE_CURRENT_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + } + } +] diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/pserver.py b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/pserver.py new file mode 100644 index 00000000..b54982c8 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/pserver.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pserver = { + "apiVersion": "extensions/v1beta1", + "kind": "ReplicaSet", + "metadata": { + "name": "jobname-pserver" + }, + "spec": { + "replicas": 1, + "template": { + "metadata": { + "labels": { + "paddle-job-pserver": "jobname" + } + }, + "spec": { + "hostNetwork": True, + "imagePullSecrets": [{ + "name": "job-registry-secret" + }], + "containers": [{ + "name": "pserver", + "image": "", + "imagePullPolicy": "Always", + "ports": [{ + "name": "jobport-1", + "containerPort": 1 + }], + "env": [], + "command": ["paddle_k8s", "start_pserver"], + "resources": { + "requests": { + "memory": "10Gi", + "cpu": "4" + }, + "limits": { + "memory": "10Gi", + "cpu": "4" + } + } + }] + } + } + } +} diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/trainer.py b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/trainer.py new file mode 100644 index 00000000..b915d31e --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/trainer.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
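+
+# Template for the trainer Job. kube_gen_job.py overwrites the placeholder
+# fields below before dumping the final yaml: metadata.name (which still
+# reads "jobname-pserver" here), image, ports, resources and env, and it
+# sets parallelism == completions == the requested trainer count.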
+ +trainer = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": "jobname-pserver" + }, + "spec": { + "parallelism": 4, + "completions": 4, + "template": { + "metadata": { + "labels": { + "paddle-job": "jobname" + } + }, + "spec": { + "hostNetwork": True, + "imagePullSecrets": [{ + "name": "job-registry-secret" + }], + "restartPolicy": "Never", + "containers": [{ + "name": "trainer", + "image": "", + "imagePullPolicy": "Always", + # to let container set rlimit + "securityContext": { + "privileged": True + # TODO(wuyi): use below specific cap instead of privileged, + # using privileged will cause all GPU device are visible + # in the container. + # "capabilities": { + # "add": ["SYS_RESOURCE"] + # } + }, + "ports": [{ + "name": "jobport-1", + "containerPort": 1 + }], + "env": [], + "command": ["paddle_k8s", "start_trainer", "v2"], + "resources": { + "requests": { + "memory": "10Gi", + "cpu": "4", + }, + "limits": { + "memory": "10Gi", + "cpu": "4", + } + } + }] + } + } + } +} diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/__init__.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/__init__.py new file mode 100644 index 00000000..1c3fcac8 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/machine_translation.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/machine_translation.py new file mode 100644 index 00000000..122a66c9 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/machine_translation.py @@ -0,0 +1,232 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
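+
+# lstm_step() below hand-builds one LSTM cell out of fc/sigmoid/tanh ops,
+# i.e., with [h, x] denoting the concatenated previous hidden state and input:
+#   f_t  = sigmoid(W_f [h, x])      forget gate
+#   i_t  = sigmoid(W_i [h, x])      input gate
+#   o_t  = sigmoid(W_o [h, x])      output gate
+#   c~_t = tanh(W_c [h, x])         candidate cell state
+#   c_t  = f_t * c_{t-1} + i_t * c~_t
+#   h_t  = o_t * tanh(c_t)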
+"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. + input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, + decoder_boot, decoder_size): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + act='tanh', + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = 
fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + encoder_vec = rnn.static_input(encoder_vec) + encoder_proj = rnn.static_input(encoder_proj) + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) + cell_mem = rnn.memory(init=cell_init) + context = simple_attention(encoder_vec, encoder_proj, hidden_mem) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + rnn.output(out) + return rnn() + + if not is_generating: + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, + encoded_proj, decoder_boot, + decoder_size) + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + return avg_cost, feeding_list + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray + + +def get_model(args): + embedding_dim = 512 + encoder_size = 512 + decoder_size = 512 + dict_size = 30000 + beam_size = 3 + max_length = 250 + avg_cost, feeding_list = seq_to_seq_net( + embedding_dim, + encoder_size, + decoder_size, + dict_size, + dict_size, + False, + beam_size=beam_size, + max_length=max_length) + + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(dict_size), buf_size=1000), + batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, train_batch_generator, \ + test_batch_generator, None, None diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/mnist.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/mnist.py new file mode 100644 index 00000000..9606304b --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/mnist.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import cProfile + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +SEED = 1 +DTYPE = "float32" + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(args): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc, batch_size_tensor diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/resnet.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/resnet.py new file mode 100644 index 00000000..34748e37 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/resnet.py @@ -0,0 +1,161 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import numpy as np +import time + +import cProfile, pstats, StringIO + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride): + short = shortcut(input, ch_out * 4, stride) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) + conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) + for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return out + + +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): + assert (depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + return out + + +def get_model(args): + model = resnet_cifar10 + if args.data_set == "cifar10": + class_dim = 10 + if args.data_format == 'NCHW': + dshape = [3, 32, 32] + else: + dshape = [32, 32, 3] + model = resnet_cifar10 + else: + class_dim = 102 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + + input = 
fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc, batch_size_tensor diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/stacked_dynamic_lstm.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/stacked_dynamic_lstm.py new file mode 100644 index 00000000..bd44a607 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/stacked_dynamic_lstm.py @@ -0,0 +1,139 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
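+
+# to_lodtensor() at the bottom of this file packs a batch of variable-length
+# sequences into a single LoDTensor: e.g. three sequences of lengths
+# [3, 2, 4] are flattened into a 9x1 int64 tensor with lod [[0, 3, 5, 9]].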
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import cPickle +import os +import random +import time + +import numpy +import paddle +import paddle.dataset.imdb as imdb +import paddle.fluid as fluid +import paddle.batch as batch +import paddle.fluid.profiler as profiler + +word_dict = imdb.word_dict() + + +def crop_sentence(reader, crop_size): + unk_value = word_dict[''] + + def __impl__(): + for item in reader(): + if len([x for x in item[0] if x != unk_value]) < crop_size: + yield item + + return __impl__ + + +def get_model(args): + lstm_size = 512 + emb_dim = 512 + crop_size = 1500 + + data = fluid.layers.data( + name="words", shape=[1], lod_level=1, dtype='int64') + sentence = fluid.layers.embedding( + input=data, size=[len(word_dict), emb_dim]) + + sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + word = rnn.step_input(sentence) + prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) + prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) + + def gate_common( + ipt, + hidden, + size, ): + gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) + gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) + gate = fluid.layers.sums(input=[gate0, gate1]) + return gate + + forget_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + input_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + output_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + cell_gate = fluid.layers.tanh( + x=gate_common(word, prev_hidden, lstm_size)) + + cell = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul( + x=input_gate, y=cell_gate) + ]) + + hidden = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell)) + + rnn.update_memory(prev_cell, cell) + rnn.update_memory(prev_hidden, hidden) + rnn.output(hidden) + + last = fluid.layers.sequence_pool(rnn(), 'last') + logit = fluid.layers.fc(input=last, size=2, act='softmax') + loss = fluid.layers.cross_entropy( + input=logit, + label=fluid.layers.data( + name='label', shape=[1], dtype='int64')) + loss = fluid.layers.mean(x=loss) + + # add acc + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ + shape=[1], dtype='int64'), total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + adam = fluid.optimizer.Adam() + + train_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), + batch_size=args.batch_size) + test_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), + batch_size=args.batch_size) + + return loss, inference_program, adam, train_reader, test_reader, batch_acc, batch_size_tensor + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = numpy.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + 
res.set_lod([lod]) + return res diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/vgg.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/vgg.py new file mode 100644 index 00000000..6571bbf6 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/vgg.py @@ -0,0 +1,104 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import argparse +import functools + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def get_model(args): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + 
batch_size=args.batch_size)
+
+    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc, batch_size_tensor
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/run.sh b/vgg16_aws_dist/fluid_benchmark_for_aws/run.sh
new file mode 100644
index 00000000..f6dfd20b
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/run.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# This script benchmarks PaddlePaddle Fluid on a
+# single thread and a single GPU.
+
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+if [ $ht -eq 1 ]; then # HT is OFF
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+        export OMP_DYNAMIC="FALSE"
+    fi
+else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+fi
+# disable multi-gpu if there is more than one
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+# only query the gpu used
+nohup stdbuf -oL nvidia-smi \
+    --id=${CUDA_VISIBLE_DEVICES} \
+    --query-gpu=timestamp \
+    --query-compute-apps=pid,process_name,used_memory \
+    --format=csv \
+    --filename=mem.log \
+    -l 1 &
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+    --device=GPU \
+    --batch_size=128 \
+    --skip_batch_num=5 \
+    --iterations=500 \
+    2>&1 | tee -a mnist_gpu_128.log
+
+# vgg16
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+    --device=GPU \
+    --batch_size=128 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a vgg16_gpu_128.log
+
+# flowers gpu 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+    --device=GPU \
+    --batch_size=32 \
+    --data_set=flowers \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a vgg16_gpu_flowers_32.log
+
+# resnet50
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+    --device=GPU \
+    --batch_size=128 \
+    --data_set=cifar10 \
+    --model=resnet_cifar10 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+    --device=GPU \
+    --batch_size=64 \
+    --data_set=flowers \
+    --model=resnet_imagenet \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a resnet50_gpu_flowers_64.log
+
+# lstm
+# lstm gpu imdb 32 # tensorflow only support batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+    --device=GPU \
+    --batch_size=32 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    --hidden_dim=512 \
+    --emb_dim=512 \
+    --crop_size=1500 \
+    2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmb 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+    --device=GPU \
+    --batch_size=128 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a lstm_gpu_128.log
diff --git a/vgg16_aws_dist/latest_kpis/speedup_rate_factor.txt b/vgg16_aws_dist/latest_kpis/speedup_rate_factor.txt
new file mode 100644
index 00000000..edf5775a
--- /dev/null
+++ b/vgg16_aws_dist/latest_kpis/speedup_rate_factor.txt
@@ -0,0 +1 @@
+[0.5]
\ No newline at end of file
diff --git a/vgg16_aws_dist/run.xsh b/vgg16_aws_dist/run.xsh
new file mode 100755
index 00000000..be239834
---
/dev/null
+++ b/vgg16_aws_dist/run.xsh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+set -xe
+
+CURRENT_FILE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+PADDLE_PATH=$CURRENT_FILE_DIR/../../..
+paddle_build_path=$PADDLE_PATH/build
+paddle_docker_hub_tag="paddlepaddlece/paddle:latest"
+fluid_benchmark_dockerhub_tag="paddlepaddlece/fluid_benchmark:latest"
+training_command="update_method:pserver,acc_target:0.6,iterations:100,pass_num:1"
+
+# clean up docker
+docker system prune -f
+
+# log into docker hub
+# login is now performed in teamcity
+# docker login -u $DOCKER_HUB_USERNAME -p $DOCKER_HUB_PASSWORD
+
+# create paddle docker image
+echo "going to build and push paddle production image"
+docker build -t $paddle_docker_hub_tag $paddle_build_path
+docker push $paddle_docker_hub_tag
+
+# build test docker image
+cd $CURRENT_FILE_DIR
+
+cd fluid_benchmark_for_aws
+if [ -d ~/.cache/paddle/dataset/cifar ]; then
+    echo "host cifar dataset cache found, copying it to docker root"
+    mkdir -p .cache/paddle/dataset/
+    cp -r -f ~/.cache/paddle/dataset/cifar .cache/paddle/dataset/
+fi
+
+if [ -d ~/.cache/paddle/dataset/flowers ]; then
+    echo "host flowers dataset cache found, copying it to docker root"
+    mkdir -p .cache/paddle/dataset/
+    cp -r -f ~/.cache/paddle/dataset/flowers .cache/paddle/dataset/
+fi
+
+cd ..
+
+echo "going to build fluid_benchmark_for_aws docker image and push it"
+docker build -t $fluid_benchmark_dockerhub_tag ./fluid_benchmark_for_aws
+docker push $fluid_benchmark_dockerhub_tag
+
+# fetch runner and install dependencies
+echo "going to work with aws_runner"
+if [ ! -d aws_runner ]; then
+    echo "no aws_runner found, cloning one"
+    git clone https://github.com/putcn/aws_runner.git
+fi
+cd aws_runner
+git pull
+cd ..
+echo "going to install aws_runner dependencies"
+pip install -r aws_runner/client/requirements.txt
+
+echo "going to start testing"
+# start aws testing
+python ce_runner.py \
+    --key_name aws_benchmark_us_east \
+    --security_group_id sg-95539dff \
+    --online_mode yes \
+    --pserver_command $training_command \
+    --trainer_command $training_command \
+    --docker_image $fluid_benchmark_dockerhub_tag
\ No newline at end of file
diff --git a/vgg16_aws_dist/speedup_vgg_16_1_1_0_factor.txt b/vgg16_aws_dist/speedup_vgg_16_1_1_0_factor.txt
new file mode 100644
index 00000000..e7a19a6e
--- /dev/null
+++ b/vgg16_aws_dist/speedup_vgg_16_1_1_0_factor.txt
@@ -0,0 +1 @@
+[1.0]
\ No newline at end of file
diff --git a/vgg16_aws_dist/speedup_vgg_16_4_4_4_factor.txt b/vgg16_aws_dist/speedup_vgg_16_4_4_4_factor.txt
new file mode 100644
index 00000000..3ea09272
--- /dev/null
+++ b/vgg16_aws_dist/speedup_vgg_16_4_4_4_factor.txt
@@ -0,0 +1 @@
+[10.233551979064941]
\ No newline at end of file
diff --git a/vgg16_aws_dist/speedup_vgg_16_7_8_7_factor.txt b/vgg16_aws_dist/speedup_vgg_16_7_8_7_factor.txt
new file mode 100644
index 00000000..c3f822e5
--- /dev/null
+++ b/vgg16_aws_dist/speedup_vgg_16_7_8_7_factor.txt
@@ -0,0 +1 @@
+[11.316923141479492]
\ No newline at end of file
diff --git a/vgg16_aws_dist/train_speed_vgg_16_1_1_0_factor.txt b/vgg16_aws_dist/train_speed_vgg_16_1_1_0_factor.txt
new file mode 100644
index 00000000..55d41345
--- /dev/null
+++ b/vgg16_aws_dist/train_speed_vgg_16_1_1_0_factor.txt
@@ -0,0 +1 @@
+[11.437457084655762]
\ No newline at end of file
diff --git a/vgg16_aws_dist/train_speed_vgg_16_4_4_4_factor.txt b/vgg16_aws_dist/train_speed_vgg_16_4_4_4_factor.txt
new file mode 100644
index 00000000..c133cf2f
--- /dev/null
+++
b/vgg16_aws_dist/train_speed_vgg_16_4_4_4_factor.txt @@ -0,0 +1 @@ +[29.26145362854004] \ No newline at end of file diff --git a/vgg16_aws_dist/train_speed_vgg_16_7_8_7_factor.txt b/vgg16_aws_dist/train_speed_vgg_16_7_8_7_factor.txt new file mode 100644 index 00000000..b0991782 --- /dev/null +++ b/vgg16_aws_dist/train_speed_vgg_16_7_8_7_factor.txt @@ -0,0 +1 @@ +[18.49097442626953] \ No newline at end of file