diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..7684eed1
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,13 @@
+repos:
+- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
+  sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
+  hooks:
+  - id: yapf
+    files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
+  hooks:
+  - id: check-added-large-files
+  - id: check-merge-conflict
+  - id: check-symlinks
+  - id: end-of-file-fixer
diff --git a/README.md b/README.md
index 63388821..7fb29933 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,9 @@
 
 ## Howtos
 
+### Contribute
+- Run `pre-commit run -a` before submitting your PR; it will format the code automatically
+
 ### Add New Evaluation Task
 
 Reference [mnist task](https://github.com/Superjomn/paddle-ce-latest-kpis/tree/master/mnist),
diff --git a/__ocr_recognition/continuous_evaluation.py b/__ocr_recognition/continuous_evaluation.py
new file mode 100644
index 00000000..a4da1f67
--- /dev/null
+++ b/__ocr_recognition/continuous_evaluation.py
@@ -0,0 +1,12 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+train_avg_loss_kpi = CostKpi('train_avg_loss', 0.2, 0)
+train_seq_err_kpi = CostKpi('train_seq_err', 0.2, 0)
+
+tracking_kpis = [
+    train_avg_loss_kpi,
+    train_seq_err_kpi,
+]
diff --git a/__ocr_recognition/crnn_ctc_model.py b/__ocr_recognition/crnn_ctc_model.py
new file mode 100644
index 00000000..df33100e
--- /dev/null
+++ b/__ocr_recognition/crnn_ctc_model.py
@@ -0,0 +1,221 @@
+import paddle.fluid as fluid
+
+
+def conv_bn_pool(input,
+                 group,
+                 out_ch,
+                 act="relu",
+                 param=None,
+                 bias=None,
+                 param_0=None,
+                 is_test=False):
+    tmp = input
+    for i in xrange(group):
+        tmp = fluid.layers.conv2d(
+            input=tmp,
+            num_filters=out_ch[i],
+            filter_size=3,
+            padding=1,
+            param_attr=param if param_0 is None else param_0,
+            act=None,  # LinearActivation
+            use_cudnn=True)
+        tmp = fluid.layers.batch_norm(
+            input=tmp,
+            act=act,
+            param_attr=param,
+            bias_attr=bias,
+            is_test=is_test)
+    tmp = fluid.layers.pool2d(
+        input=tmp,
+        pool_size=2,
+        pool_type='max',
+        pool_stride=2,
+        use_cudnn=True,
+        ceil_mode=True)
+
+    return tmp
+
+
+def ocr_convs(input,
+              num,
+              with_bn,
+              regularizer=None,
+              gradient_clip=None,
+              is_test=False):
+    assert (num % 4 == 0)
+
+    b = fluid.ParamAttr(
+        regularizer=regularizer,
+        gradient_clip=gradient_clip,
+        initializer=fluid.initializer.Normal(0.0, 0.0))
+    w0 = fluid.ParamAttr(
+        regularizer=regularizer,
+        gradient_clip=gradient_clip,
+        initializer=fluid.initializer.Normal(0.0, 0.0005))
+    w1 = fluid.ParamAttr(
+        regularizer=regularizer,
+        gradient_clip=gradient_clip,
+        initializer=fluid.initializer.Normal(0.0, 0.01))
+    tmp = input
+    tmp = conv_bn_pool(
+        tmp, 2, [16, 16], param=w1, bias=b, param_0=w0, is_test=is_test)
+
+    tmp = conv_bn_pool(tmp, 2, [32, 32], param=w1, bias=b, is_test=is_test)
+    tmp = conv_bn_pool(tmp, 2, [64, 64], param=w1, bias=b, is_test=is_test)
+    tmp = conv_bn_pool(tmp, 2, [128, 128], param=w1, bias=b, is_test=is_test)
+    return tmp
+
+
+def encoder_net(images,
+                num_classes,
+                rnn_hidden_size=200,
+                regularizer=None,
+                gradient_clip=None,
+                is_test=False):
+    conv_features = ocr_convs(
+        images,
+        8,
+        True,
+        regularizer=regularizer,
+        gradient_clip=gradient_clip,
+        is_test=is_test)
+    sliced_feature = fluid.layers.im2sequence(
+        input=conv_features,
+        stride=[1, 1],
+        filter_size=[conv_features.shape[2],
1]) + + para_attr = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.02)) + bias_attr = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.02), + learning_rate=2.0) + bias_attr_nobias = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.02)) + + fc_1 = fluid.layers.fc(input=sliced_feature, + size=rnn_hidden_size * 3, + param_attr=para_attr, + bias_attr=bias_attr_nobias) + fc_2 = fluid.layers.fc(input=sliced_feature, + size=rnn_hidden_size * 3, + param_attr=para_attr, + bias_attr=bias_attr_nobias) + + gru_forward = fluid.layers.dynamic_gru( + input=fc_1, + size=rnn_hidden_size, + param_attr=para_attr, + bias_attr=bias_attr, + candidate_activation='relu') + gru_backward = fluid.layers.dynamic_gru( + input=fc_2, + size=rnn_hidden_size, + is_reverse=True, + param_attr=para_attr, + bias_attr=bias_attr, + candidate_activation='relu') + + w_attr = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.02)) + b_attr = fluid.ParamAttr( + regularizer=regularizer, + gradient_clip=gradient_clip, + initializer=fluid.initializer.Normal(0.0, 0.0)) + + fc_out = fluid.layers.fc(input=[gru_forward, gru_backward], + size=num_classes + 1, + param_attr=w_attr, + bias_attr=b_attr) + + return fc_out + + +def ctc_train_net(images, label, args, num_classes): + regularizer = fluid.regularizer.L2Decay(args.l2) + gradient_clip = None + if args.parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places, use_nccl=True) + with pd.do(): + images_ = pd.read_input(images) + label_ = pd.read_input(label) + + fc_out = encoder_net( + images_, + num_classes, + regularizer=regularizer, + gradient_clip=gradient_clip) + + cost = fluid.layers.warpctc( + input=fc_out, + label=label_, + blank=num_classes, + norm_by_times=True) + sum_cost = fluid.layers.reduce_sum(cost) + + decoded_out = fluid.layers.ctc_greedy_decoder( + input=fc_out, blank=num_classes) + + pd.write_output(sum_cost) + pd.write_output(decoded_out) + + sum_cost, decoded_out = pd() + sum_cost = fluid.layers.reduce_sum(sum_cost) + + else: + fc_out = encoder_net( + images, + num_classes, + regularizer=regularizer, + gradient_clip=gradient_clip) + + cost = fluid.layers.warpctc( + input=fc_out, label=label, blank=num_classes, norm_by_times=True) + sum_cost = fluid.layers.reduce_sum(cost) + decoded_out = fluid.layers.ctc_greedy_decoder( + input=fc_out, blank=num_classes) + + casted_label = fluid.layers.cast(x=label, dtype='int64') + error_evaluator = fluid.evaluator.EditDistance( + input=decoded_out, label=casted_label) + + inference_program = fluid.default_main_program().clone(for_test=True) + + optimizer = fluid.optimizer.Momentum( + learning_rate=args.learning_rate, momentum=args.momentum) + _, params_grads = optimizer.minimize(sum_cost) + model_average = fluid.optimizer.ModelAverage( + args.average_window, + params_grads, + min_average_window=args.min_average_window, + max_average_window=args.max_average_window) + + return sum_cost, error_evaluator, inference_program, model_average + + +def ctc_infer(images, num_classes): + fc_out = encoder_net(images, num_classes, is_test=True) + return fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes) + + +def ctc_eval(images, label, num_classes): + fc_out = encoder_net(images, num_classes, is_test=True) + decoded_out = 
fluid.layers.ctc_greedy_decoder(
+        input=fc_out, blank=num_classes)
+
+    casted_label = fluid.layers.cast(x=label, dtype='int64')
+    error_evaluator = fluid.evaluator.EditDistance(
+        input=decoded_out, label=casted_label)
+
+    cost = fluid.layers.warpctc(
+        input=fc_out, label=label, blank=num_classes, norm_by_times=True)
+
+    return error_evaluator, cost
diff --git a/__ocr_recognition/ctc_reader.py b/__ocr_recognition/ctc_reader.py
new file mode 100644
index 00000000..5e65ef42
--- /dev/null
+++ b/__ocr_recognition/ctc_reader.py
@@ -0,0 +1,201 @@
+import os
+import cv2
+import tarfile
+import numpy as np
+from PIL import Image
+from os import path
+import paddle as paddle
+from paddle.utils.image_util import load_image
+
+NUM_CLASSES = 10784
+DATA_SHAPE = [1, 48, 512]
+
+DATA_MD5 = "1de60d54d19632022144e4e58c2637b5"
+DATA_URL = "http://cloud.dlnel.org/filepub/?uuid=df937251-3c0b-480d-9a7b-0080dfeee65c"
+CACHE_DIR_NAME = "ctc_data"
+SAVED_FILE_NAME = "data.tar.gz"
+DATA_DIR_NAME = "data"
+TRAIN_DATA_DIR_NAME = "train_images"
+TEST_DATA_DIR_NAME = "test_images"
+TRAIN_LIST_FILE_NAME = "train.list"
+TEST_LIST_FILE_NAME = "test.list"
+
+
+class DataGenerator(object):
+    def __init__(self):
+        pass
+
+    def train_reader(self, img_root_dir, img_label_list, batchsize):
+        '''
+        Reader interface for training.
+
+        :param img_root_dir: The root path of the training images.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for training.
+        :type img_label_list: str
+
+        '''
+
+        img_label_lines = []
+        if batchsize == 1:
+            to_file = "tmp.txt"
+            cmd = "cat " + img_label_list + " | awk '{print $1,$2,$3,$4;}' | shuf > " + to_file
+            print "cmd: " + cmd
+            os.system(cmd)
+            print "finish batch shuffle"
+            img_label_lines = open(to_file, 'r').readlines()
+        else:
+            to_file = "tmp.txt"
+            # cmd1: partial shuffle
+            cmd = "cat " + img_label_list + " | awk '{printf(\"%04d%.4f %s\\n\", $1, rand(), $0)}' | sort | sed 1,$((1 + RANDOM % 100))d | "
+            # cmd2: batch merge and shuffle
+            cmd += "awk '{printf $2\" \"$3\" \"$4\" \"$5\" \"; if(NR % " + str(
+                batchsize) + " == 0) print \"\";}' | shuf | "
+            # cmd3: batch split
+            cmd += "awk '{if(NF == " + str(
+                batchsize
+            ) + " * 4) {for(i = 0; i < " + str(
+                batchsize
+            ) + "; i++) print $(4*i+1)\" \"$(4*i+2)\" \"$(4*i+3)\" \"$(4*i+4);}}' > " + to_file
+            print "cmd: " + cmd
+            os.system(cmd)
+            print "finish batch shuffle"
+            img_label_lines = open(to_file, 'r').readlines()
+
+        def reader():
+            sizes = len(img_label_lines) / batchsize
+            for i in range(sizes):
+                result = []
+                sz = [0, 0]
+                for j in range(batchsize):
+                    line = img_label_lines[i * batchsize + j]
+                    # h, w, img_name, labels
+                    items = line.split(' ')
+
+                    label = [int(c) for c in items[-1].split(',')]
+                    img = Image.open(os.path.join(img_root_dir, items[
+                        2])).convert('L')  # convert to grayscale
+                    if j == 0:
+                        sz = img.size
+                    img = img.resize((sz[0], sz[1]))
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    result.append([img, label])
+                yield result
+
+        return reader
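+
+    # A minimal usage sketch (the paths here are hypothetical; shapes
+    # follow the code above):
+    #
+    #     reader = DataGenerator().train_reader('train_images',
+    #                                           'train.list', 32)
+    #     for batch in reader():
+    #         pass  # batch: list of 32 [img, label] pairs; img is a
+    #               # [1, H, W] float array, label a list of char ids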
+    def test_reader(self, img_root_dir, img_label_list):
+        '''
+        Reader interface for evaluation.
+
+        :param img_root_dir: The root path of the test images.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for testing.
+        :type img_label_list: str
+        '''
+
+        def reader():
+            for line in open(img_label_list):
+                # h, w, img_name, labels
+                items = line.split(' ')
+
+                label = [int(c) for c in items[-1].split(',')]
+                img = Image.open(os.path.join(img_root_dir, items[2])).convert(
+                    'L')
+                img = np.array(img) - 127.5
+                img = img[np.newaxis, ...]
+                yield img, label
+
+        return reader
+
+    def infer_reader(self, img_root_dir=None, img_label_list=None):
+        '''A reader interface for inference.
+
+        :param img_root_dir: The root path of the images for inference.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the image-list file. If
+            img_root_dir is None, each line must be a full image path;
+            if img_label_list is also None, image paths are read from stdin.
+        :type img_label_list: str
+        '''
+
+        def reader():
+            if img_label_list is not None:
+                for line in open(img_label_list):
+                    if img_root_dir is not None:
+                        # h, w, img_name, labels
+                        img_name = line.split(' ')[2]
+                        img_path = os.path.join(img_root_dir, img_name)
+                    else:
+                        img_path = line.strip("\t\n\r")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    # No ground truth is available at inference time, so
+                    # yield a dummy label, as in the stdin branch below.
+                    yield img, [[0]]
+            else:
+                while True:
+                    img_path = raw_input("Please input the path of image: ")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    yield img, [[0]]
+
+        return reader
+
+
+def num_classes():
+    '''Get classes number of this dataset.
+    '''
+    return NUM_CLASSES
+
+
+def data_shape():
+    '''Get image shape of this dataset. It is a dummy shape for this dataset.
+    '''
+    return DATA_SHAPE
+
+
+def train(batch_size, train_images_dir=None, train_list_file=None):
+    generator = DataGenerator()
+    if train_images_dir is None:
+        data_dir = download_data()
+        train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME)
+    if train_list_file is None:
+        train_list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME)
+    return generator.train_reader(train_images_dir, train_list_file,
+                                  batch_size)
+
+
+def test(batch_size=1, test_images_dir=None, test_list_file=None):
+    generator = DataGenerator()
+    if test_images_dir is None:
+        data_dir = download_data()
+        test_images_dir = path.join(data_dir, TEST_DATA_DIR_NAME)
+    if test_list_file is None:
+        test_list_file = path.join(data_dir, TEST_LIST_FILE_NAME)
+    return paddle.batch(
+        generator.test_reader(test_images_dir, test_list_file), batch_size)
+
+
+def inference(infer_images_dir=None, infer_list_file=None):
+    generator = DataGenerator()
+    return paddle.batch(
+        generator.infer_reader(infer_images_dir, infer_list_file), 1)
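+
+
+# NOTE: download_data() fetches the archive through paddle.dataset.common,
+# which caches it (by default under ~/.cache/paddle/dataset/ctc_data) and
+# verifies DATA_MD5, so repeated runs skip the download.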
+def download_data():
+    '''Download train and test data.
+    '''
+    tar_file = paddle.dataset.common.download(
+        DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
+    data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
+    if not path.isdir(data_dir):
+        t = tarfile.open(tar_file, "r:gz")
+        t.extractall(path=path.dirname(tar_file))
+        t.close()
+    return data_dir
diff --git a/__ocr_recognition/ctc_train.py b/__ocr_recognition/ctc_train.py
new file mode 100644
index 00000000..43fcd13e
--- /dev/null
+++ b/__ocr_recognition/ctc_train.py
@@ -0,0 +1,138 @@
+"""Trainer for OCR CTC model."""
+import paddle.fluid as fluid
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
+from crnn_ctc_model import ctc_train_net
+import ctc_reader
+import argparse
+import functools
+import sys
+import time
+import os
+import numpy as np
+from continuous_evaluation import train_avg_loss_kpi, train_seq_err_kpi
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('pass_num', int, 100, "Number of training epochs.")
+add_arg('log_period', int, 100, "Log period.")
+add_arg('iterations', int, 10000, "The total number of iterations.")
+add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.")
+add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.")
+add_arg('save_model_dir', str, "./models", "The directory the model is saved to.")
+add_arg('init_model', str, None, "The init model file or directory.")
+add_arg('learning_rate', float, 1.0e-3, "Learning rate.")
+add_arg('l2', float, 0.0004, "L2 regularizer.")
+add_arg('momentum', float, 0.9, "Momentum.")
+add_arg('rnn_hidden_size', int, 200, "Hidden size of rnn layers.")
+add_arg('use_gpu', bool, True, "Whether to use GPU to train.")
+add_arg('min_average_window', int, 10000, "Min average window.")
+add_arg('max_average_window', int, 15625, "Max average window. It is proposed to be set as the number of minibatches in a pass.")
+add_arg('average_window', float, 0.15, "Average window.")
+add_arg('parallel', bool, False, "Whether to use parallel training.")
+add_arg('train_images', str, None, "The directory of training images."
+        "None means using the default training images of reader.")
+add_arg('train_list', str, None, "The list file of training images."
+        "None means using the default train_list file of reader.")
+add_arg('test_images', str, None, "The directory of testing images."
+        "None means using the default test images of reader.")
+add_arg('test_list', str, None, "The list file of testing images."
+        "None means using the default test_list file of reader.")
+add_arg('num_classes', int, None, "The number of classes."
+ "None means using the default num_classes from reader.") +# yapf: enable + + +def train(args, data_reader=ctc_reader): + """OCR CTC training""" + num_classes = data_reader.num_classes( + ) if args.num_classes is None else args.num_classes + data_shape = data_reader.data_shape() + # define network + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int32', lod_level=1) + sum_cost, error_evaluator, inference_program, model_average = ctc_train_net( + images, label, args, num_classes) + + # data reader + train_reader = data_reader.train( + args.batch_size, + train_images_dir=args.train_images, + train_list_file=args.train_list) + test_reader = data_reader.test( + test_images_dir=args.test_images, test_list_file=args.test_list) + + # prepare environment + place = fluid.CPUPlace() + if args.use_gpu: + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + # load init model + if args.init_model is not None: + model_dir = args.init_model + model_file_name = None + if not os.path.isdir(args.init_model): + model_dir = os.path.dirname(args.init_model) + model_file_name = os.path.basename(args.init_model) + fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name) + print "Init model from: %s." % args.init_model + + for pass_id in range(args.pass_num): + error_evaluator.reset(exe) + batch_id = 1 + total_loss = 0.0 + total_seq_error = 0.0 + # train a pass + for data in train_reader(): + batch_loss, _, batch_seq_error = exe.run( + fluid.default_main_program(), + feed=get_feeder_data(data, place), + fetch_list=[sum_cost] + error_evaluator.metrics) + total_loss += batch_loss[0] + total_seq_error += batch_seq_error[0] + # training log + if batch_id % args.log_period == 0: + print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." 
% (
+                    time.time(), pass_id, batch_id,
+                    total_loss / (batch_id * args.batch_size),
+                    total_seq_error / (batch_id * args.batch_size))
+                sys.stdout.flush()
+            if batch_id == args.iterations - 1:
+                avg_seq_err = batch_seq_error[0] / args.batch_size
+                avg_loss = batch_loss[0] / args.batch_size
+                train_avg_loss_kpi.add_record(
+                    np.array(
+                        avg_loss, dtype='float32'))
+                train_seq_err_kpi.add_record(
+                    np.array(
+                        avg_seq_err, dtype='float32'))
+                break
+            # evaluate
+            if batch_id % args.eval_period == 0:
+                with model_average.apply(exe):
+                    error_evaluator.reset(exe)
+                    for data in test_reader():
+                        exe.run(inference_program,
+                                feed=get_feeder_data(data, place))
+                    _, test_seq_error = error_evaluator.eval(exe)
+
+                    print "\nTime: %s; Pass[%d]-batch[%d]; Test seq error: %s.\n" % (
+                        time.time(), pass_id, batch_id, str(test_seq_error[0]))
+
+            batch_id += 1
+        train_avg_loss_kpi.persist()
+        train_seq_err_kpi.persist()
+
+
+def main():
+    args = parser.parse_args()
+    print_arguments(args)
+    train(args, data_reader=ctc_reader)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/__ocr_recognition/eval.py b/__ocr_recognition/eval.py
new file mode 100644
index 00000000..1c33ff36
--- /dev/null
+++ b/__ocr_recognition/eval.py
@@ -0,0 +1,71 @@
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
+from crnn_ctc_model import ctc_infer
+from crnn_ctc_model import ctc_eval
+import ctc_reader
+import argparse
+import functools
+import os
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('model_path', str, None, "The model path to be used for evaluation.")
+add_arg('input_images_dir', str, None, "The directory of images.")
+add_arg('input_images_list', str, None, "The list file of images.")
+add_arg('use_gpu', bool, True, "Whether to use GPU for evaluation.")
+# yapf: enable
+
+
+def evaluate(args, eval=ctc_eval, data_reader=ctc_reader):
+    """OCR model evaluation"""
+    num_classes = data_reader.num_classes()
+    data_shape = data_reader.data_shape()
+    # define network
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(
+        name='label', shape=[1], dtype='int32', lod_level=1)
+    evaluator, cost = eval(images, label, num_classes)
+
+    # data reader
+    test_reader = data_reader.test(
+        test_images_dir=args.input_images_dir,
+        test_list_file=args.input_images_list)
+
+    # prepare environment
+    place = fluid.CPUPlace()
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    # load init model
+    model_dir = args.model_path
+    model_file_name = None
+    if not os.path.isdir(args.model_path):
+        model_dir = os.path.dirname(args.model_path)
+        model_file_name = os.path.basename(args.model_path)
+    fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
+    print "Init model from: %s." % args.model_path
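+
+    # evaluator accumulates edit distances over every batch fed below;
+    # eval(exe) then returns (avg_distance, avg_seq_error) over all
+    # samples read so far.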
+    evaluator.reset(exe)
+    count = 0
+    for data in test_reader():
+        count += 1
+        exe.run(fluid.default_main_program(),
+                feed=get_feeder_data(data, place))
+    avg_distance, avg_seq_error = evaluator.eval(exe)
+    print "Read %d samples; avg_distance: %s; avg_seq_error: %s" % (
+        count, avg_distance, avg_seq_error)
+
+
+def main():
+    args = parser.parse_args()
+    print_arguments(args)
+    evaluate(args, data_reader=ctc_reader)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/__ocr_recognition/images/demo.jpg b/__ocr_recognition/images/demo.jpg
new file mode 100644
index 00000000..be5aee50
Binary files /dev/null and b/__ocr_recognition/images/demo.jpg differ
diff --git a/__ocr_recognition/images/train.jpg b/__ocr_recognition/images/train.jpg
new file mode 100644
index 00000000..3d691f1c
Binary files /dev/null and b/__ocr_recognition/images/train.jpg differ
diff --git a/__ocr_recognition/inference.py b/__ocr_recognition/inference.py
new file mode 100644
index 00000000..04175bb1
--- /dev/null
+++ b/__ocr_recognition/inference.py
@@ -0,0 +1,65 @@
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
+from crnn_ctc_model import ctc_infer
+import numpy as np
+import ctc_reader
+import argparse
+import functools
+import os
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('model_path', str, None, "The model path to be used for inference.")
+add_arg('input_images_dir', str, None, "The directory of images.")
+add_arg('input_images_list', str, None, "The list file of images.")
+add_arg('use_gpu', bool, True, "Whether to use GPU for inference.")
+# yapf: enable
+
+
+def inference(args, infer=ctc_infer, data_reader=ctc_reader):
+    """OCR inference"""
+    num_classes = data_reader.num_classes()
+    data_shape = data_reader.data_shape()
+    # define network
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    sequence = infer(images, num_classes)
+    # data reader
+    infer_reader = data_reader.inference(
+        infer_images_dir=args.input_images_dir,
+        infer_list_file=args.input_images_list)
+    # prepare environment
+    place = fluid.CPUPlace()
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    # load init model
+    model_dir = args.model_path
+    model_file_name = None
+    if not os.path.isdir(args.model_path):
+        model_dir = os.path.dirname(args.model_path)
+        model_file_name = os.path.basename(args.model_path)
+    fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
+    print "Init model from: %s." % args.model_path
+
+    for data in infer_reader():
+        result = exe.run(fluid.default_main_program(),
+                         feed=get_feeder_data(
+                             data, place, need_label=False),
+                         fetch_list=[sequence],
+                         return_numpy=False)
+        print "result: %s" % (np.array(result[0]).flatten(), )
+
+
+def main():
+    args = parser.parse_args()
+    print_arguments(args)
+    inference(args, data_reader=ctc_reader)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/__ocr_recognition/latest_kpis/train_avg_loss_factor.txt b/__ocr_recognition/latest_kpis/train_avg_loss_factor.txt
new file mode 100644
index 00000000..e1a834b0
--- /dev/null
+++ b/__ocr_recognition/latest_kpis/train_avg_loss_factor.txt
@@ -0,0 +1 @@
+[8196.62353515625]
diff --git a/__ocr_recognition/latest_kpis/train_seq_err_factor.txt b/__ocr_recognition/latest_kpis/train_seq_err_factor.txt
new file mode 100644
index 00000000..07787fb2
--- /dev/null
+++ b/__ocr_recognition/latest_kpis/train_seq_err_factor.txt
@@ -0,0 +1 @@
+[828.0]
diff --git a/__ocr_recognition/run.xsh b/__ocr_recognition/run.xsh
new file mode 100755
index 00000000..ebbe41c7
--- /dev/null
+++ b/__ocr_recognition/run.xsh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${ocr_recognition_cudaid:=0} # use the 0-th card by default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.9 python ctc_train.py --use_gpu=True --batch_size=128 --pass_num=1 --iterations=3000
diff --git a/__ocr_recognition/utility.py b/__ocr_recognition/utility.py
new file mode 100644
index 00000000..67a5bfa0
--- /dev/null
+++ b/__ocr_recognition/utility.py
@@ -0,0 +1,90 @@
+"""Contains common utility functions."""
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+from paddle.fluid import core
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
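+
+
+# A note on the layout to_lodtensor builds (assuming int32 label sequences,
+# as produced by the OCR reader): `lod` is Fluid's cumulative-offset list,
+# e.g. sequence lengths [2, 3] yield lod [0, 2, 5] over the flattened data.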
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int32")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def get_feeder_data(data, place, need_label=True):
+    pixel_tensor = core.LoDTensor()
+    pixel_data = None
+    pixel_data = np.concatenate(
+        map(lambda x: x[0][np.newaxis, :], data), axis=0).astype("float32")
+    pixel_tensor.set(pixel_data, place)
+    label_tensor = to_lodtensor(map(lambda x: x[1], data), place)
+    if need_label:
+        return {"pixel": pixel_tensor, "label": label_tensor}
+    else:
+        return {"pixel": pixel_tensor}
diff --git a/resnet30/__init__.py b/__resnet30/__init__.py
similarity index 100%
rename from resnet30/__init__.py
rename to __resnet30/__init__.py
diff --git a/__resnet30/continuous_evaluation.py b/__resnet30/continuous_evaluation.py
new file mode 100644
index 00000000..0ac5f0b4
--- /dev/null
+++ b/__resnet30/continuous_evaluation.py
@@ -0,0 +1,19 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, AccKpi, DurationKpi
+
+train_cost_kpi = CostKpi('train_cost', 0.05, 0, actived=True)
+train_acc_kpi = AccKpi('train_acc', 0.02, 0, actived=True)
+test_acc_kpi = AccKpi('test_acc', 0.05, 0, actived=True)
+train_speed_kpi = AccKpi('train_speed', 0.01, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.02, 0, actived=True)
+
+
+tracking_kpis = [
+    train_cost_kpi,
+    train_acc_kpi,
+    test_acc_kpi,
+    train_speed_kpi,
+    train_duration_kpi,
+]
diff --git a/resnet30/history/train_cost_factor.txt b/__resnet30/history/train_cost_factor.txt
similarity index 79%
rename from resnet30/history/train_cost_factor.txt
rename to __resnet30/history/train_cost_factor.txt
index cf262792..c46c47b0 100644
--- a/resnet30/history/train_cost_factor.txt
+++ b/__resnet30/history/train_cost_factor.txt
@@ -2,4 +2,4 @@
 [[2.744691848754883]]
 [[2.5916006565093994]]
 [[2.459857225418091]]
-[[2.3514037132263184]]
\ No newline at end of file
+[[2.3514037132263184]]
diff --git a/resnet30/history/train_duration_factor.txt b/__resnet30/history/train_duration_factor.txt
similarity index 80%
rename from resnet30/history/train_duration_factor.txt
rename to __resnet30/history/train_duration_factor.txt
index f5c4f294..ebc0b54b 100644
--- a/resnet30/history/train_duration_factor.txt
+++ b/__resnet30/history/train_duration_factor.txt
@@ -2,4 +2,4 @@
 [10.211545944213867]
 [10.223276853561401]
 [10.213245153427124]
-[10.241420984268188]
\ No newline at end of file
+[10.241420984268188]
diff --git a/__resnet30/latest_kpis/test_acc_factor.txt b/__resnet30/latest_kpis/test_acc_factor.txt
new file mode 100644
index 00000000..d7046f5b
--- /dev/null
+++ b/__resnet30/latest_kpis/test_acc_factor.txt
@@ -0,0 +1 @@
+[0.459300000667572]
diff --git a/__resnet30/latest_kpis/train_acc_factor.txt b/__resnet30/latest_kpis/train_acc_factor.txt
new file mode 100644
index 00000000..bab1e137
--- /dev/null
+++ b/__resnet30/latest_kpis/train_acc_factor.txt
@@ -0,0 +1 @@
+[0.56150390625]
diff --git a/__resnet30/latest_kpis/train_cost_factor.txt b/__resnet30/latest_kpis/train_cost_factor.txt
new file mode 100644
index 00000000..c1286410
--- /dev/null
+++ b/__resnet30/latest_kpis/train_cost_factor.txt
@@ -0,0 +1,10 @@
+[[3.121091365814209]]
+[[2.9679136276245117]]
+[[2.664355516433716]]
+[[2.5711519718170166]]
+[[2.484081745147705]]
+[[2.44614839553833]]
+[[2.416034460067749]]
+[[2.4315545558929443]]
+[[2.4579968452453613]]
+[[2.449829578399658]]
diff --git a/__resnet30/latest_kpis/train_duration_factor.txt b/__resnet30/latest_kpis/train_duration_factor.txt
new file mode 100644
index 00000000..6007f902
--- /dev/null
+++ b/__resnet30/latest_kpis/train_duration_factor.txt
@@ -0,0 +1,10 @@
+[22.25151491165161]
+[21.59505009651184]
+[21.59479784965515]
+[21.565481901168823]
+[21.499217987060547]
+[21.321773052215576]
+[21.280965089797974]
+[21.29200315475464]
+[21.28358292579651]
+[21.292808055877686]
diff --git a/__resnet30/latest_kpis/train_speed_factor.txt b/__resnet30/latest_kpis/train_speed_factor.txt
new file mode 100644
index 00000000..6c50ba81
--- /dev/null
+++ b/__resnet30/latest_kpis/train_speed_factor.txt
@@ -0,0 +1 @@
+[55.21354293823242]
diff --git a/resnet30/model.py b/__resnet30/model.py
similarity index 57%
rename from resnet30/model.py
rename to __resnet30/model.py
index 0140ab72..710d70cb 100644
--- a/resnet30/model.py
+++ b/__resnet30/model.py
@@ -11,9 +11,7 @@
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-
-from continuous_evaluation import (train_cost_kpi, train_duration_kpi,
-                                   tracking_kpis)
+from continuous_evaluation import *
 
 logger = logging.getLogger(__name__)
 
@@ -85,86 +83,123 @@ def train(batch_size, device, pass_num, iterations):
     input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
+    # Train program
     predict = resnet_cifar10(input, class_dim)
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(x=cost)
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-    opts = optimizer.minimize(avg_cost)
-    # accuracy = fluid.evaluator.Evaluator(input=predict, label=label)
+
+    # Evaluator
+    # accuracy = fluid.evaluator.Evaluator(input=predict, label=label)
+
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+    accuracy = fluid.average.WeightedAverage()
 
     # inference program
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
         # test_target = accuracy.metrics + accuracy.states
-        test_target = [predict, avg_cost]
-        inference_program = fluid.io.get_inference_program(test_target)
+        target_vars = [batch_acc, batch_size_tensor]
+        inference_program = fluid.io.get_inference_program(target_vars)
 
+    # Optimization
+    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+    opts = optimizer.minimize(avg_cost)
     fluid.memory_optimize(fluid.default_main_program())
 
     train_reader = paddle.batch(
-        paddle.dataset.cifar.train10(),
-        batch_size=batch_size)
+        paddle.dataset.cifar.train10(), batch_size=batch_size)
     test_reader = paddle.batch(
         paddle.dataset.cifar.test10(), batch_size=batch_size)
 
+    # Initialize executor
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
     def test(exe):
-        # accuracy.reset(exe)
+        test_accuracy = fluid.average.WeightedAverage()
         for batch_id, data in enumerate(test_reader()):
             img_data = np.array(map(lambda x: x[0].reshape(dshape),
                                     data)).astype("float32")
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])
-            # print('image_data', img_data)
-            # print('y_data', y_data)
-
-            predict_, avg_cost_ = exe.run(
-                inference_program,
-                feed={
-                    "data": img_data,
-                    "label": y_data
-                },
-                fetch_list=[predict, avg_cost])
-        return avg_cost
+            acc, weight = exe.run(inference_program,
+                                  feed={"data": img_data,
+                                        "label": y_data},
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
 
-        # return accuracy.eval(exe)
-
-    place = core.CPUPlace() if device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
+        return test_accuracy.eval()
 
-    for pass_id in range(1):
-        logger.warning('Pass {}'.format(pass_id))
-        # accuracy.reset(exe)
+    im_num = 0
+    total_train_time = 0.0
+    for pass_id in range(args.pass_num):
         iter = 0
+        every_pass_loss = []
+        accuracy.reset()
+        pass_duration = 0.0
         for batch_id, data in enumerate(train_reader()):
             logger.warning('Batch {}'.format(batch_id))
             batch_start = time.time()
             if iter == iterations:
                 break
-            image = np.array(map(lambda x: x[0].reshape(dshape),
-                             data)).astype('float32')
+            image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
+                'float32')
             label = np.array(map(lambda x: x[1], data)).astype('int64')
             label = label.reshape([-1, 1])
-            avg_cost_ = exe.run(
+
+            loss, acc, weight = exe.run(
                 fluid.default_main_program(),
-                feed={
-                    'data': image,
-                    'label': label
-                },
-                fetch_list=[avg_cost])
+                feed={'data': image,
+                      'label': label},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+
             batch_end = time.time()
-            print('avg_cost', np.array(avg_cost_, dtype='float32'))
-            train_cost_kpi.add_record(np.array(avg_cost_, dtype='float32'))
-            train_duration_kpi.add_record(batch_end - batch_start)
+            every_pass_loss.append(loss)
+            accuracy.add(value=acc, weight=weight)
+
+            if iter >= args.skip_batch_num or pass_id != 0:
+                batch_duration = time.time() - batch_start
+                pass_duration += batch_duration
+                im_num += label.shape[0]
             iter += 1
-            # test_start = time.time()
-            # test(exe)
-            # test_end = time.time()
-            # valid_tracker.add(test_end - test_start, pass_test_acc)
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iter, loss, acc))
+        pass_train_acc = accuracy.eval()
+        pass_test_acc = test(exe)
+
+        total_train_time += pass_duration
+        pass_train_loss = np.mean(every_pass_loss)
+        print(
+            "Pass:%d, Loss:%f, Train Accuracy:%f, Test Accuracy:%f, Handle Images Duration: %f\n"
+            % (pass_id, pass_train_loss, pass_train_acc,
+               pass_test_acc, pass_duration))
+        if pass_id == args.pass_num - 1:
+            train_cost_kpi.add_record(np.array(pass_train_loss, dtype='float32'))
+            train_cost_kpi.persist()
+            train_acc_kpi.add_record(np.array(pass_train_acc, dtype='float32'))
+            train_acc_kpi.persist()
+            test_acc_kpi.add_record(np.array(pass_test_acc, dtype='float32'))
+            test_acc_kpi.persist()
+            train_duration_kpi.add_record(batch_end - batch_start)
+            train_duration_kpi.persist()
+
+    if total_train_time > 0.0:
+        examples_per_sec = im_num / total_train_time
+        sec_per_batch = total_train_time / \
+            (iter * args.pass_num - args.skip_batch_num)
+        train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32'))
+        train_speed_kpi.persist()
 
 
 def parse_args():
@@ -172,6 +207,14 @@
     parser.add_argument('--batch_size', type=int)
     parser.add_argument('--device', type=str, choices=('CPU', 'GPU'))
     parser.add_argument('--iters', type=int)
+    parser.add_argument(
+        '--pass_num', type=int, default=3, help='The number of passes.')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial minibatches to skip, for a more accurate performance test'
+    )
     args = parser.parse_args()
     return args
 
diff --git a/resnet30/run.xsh b/__resnet30/run.xsh
similarity index 81%
rename from resnet30/run.xsh
rename to __resnet30/run.xsh
index 11393550..33644395 100755
--- a/resnet30/run.xsh
+++ b/__resnet30/run.xsh
@@ -9,4 +9,4 @@
 import sys
 
 model_file = 'model.py'
-python @(model_file) --batch_size 1000 --iters 10 --device CPU
+python @(model_file) --batch_size 128 --pass_num 5 --iters 80 --device CPU
diff --git a/image_classification/continuous_evaluation.py b/image_classification/continuous_evaluation.py
new file mode 100644
index 00000000..21f3ea06
--- /dev/null
+++ b/image_classification/continuous_evaluation.py
@@ -0,0 +1,27 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+train_acc_top1_kpi = AccKpi('train_acc_top1_kpi', 0.05, 0,
+                            desc='TOP1 ACC')
+train_acc_top5_kpi = AccKpi('train_acc_top5_kpi', 0.05, 0,
+                            actived=True,
+                            desc='TOP5 ACC')
+train_cost_kpi = CostKpi('train_cost_kpi', 0.05, 0,
+                         actived=True,
+                         desc='train cost')
+train_speed_kpi = AccKpi('train_speed_kpi', 0.05, 0,
+                         actived=True,
+                         unit_repr='images/s',
+                         desc='train speed on one GPU card')
+four_card_train_speed_kpi = AccKpi('four_card_train_speed_kpi', 0.05, 0,
+                                   actived=True,
+                                   unit_repr='images/s',
+                                   desc='train speed on four GPU cards')
+
+tracking_kpis = [train_acc_top1_kpi,
+                 train_acc_top5_kpi,
+                 train_cost_kpi,
+                 train_speed_kpi,
+                 four_card_train_speed_kpi]
diff --git a/image_classification/latest_kpis/four_card_train_speed_kpi_factor.txt b/image_classification/latest_kpis/four_card_train_speed_kpi_factor.txt
new file mode 100644
index 00000000..f2ce4eee
--- /dev/null
+++ b/image_classification/latest_kpis/four_card_train_speed_kpi_factor.txt
@@ -0,0 +1 @@
+[174.80782203734947]
\ No newline at end of file
diff --git a/image_classification/latest_kpis/train_acc_top1_kpi_factor.txt b/image_classification/latest_kpis/train_acc_top1_kpi_factor.txt
new file mode 100644
index 00000000..351fea16
--- /dev/null
+++ b/image_classification/latest_kpis/train_acc_top1_kpi_factor.txt
@@ -0,0 +1 @@
+[0.3767074942588806]
\ No newline at end of file
diff --git a/image_classification/latest_kpis/train_acc_top5_kpi_factor.txt b/image_classification/latest_kpis/train_acc_top5_kpi_factor.txt
new file mode 100644
index 00000000..9e1b075c
--- /dev/null
+++ b/image_classification/latest_kpis/train_acc_top5_kpi_factor.txt
@@ -0,0 +1 @@
+[0.5719688820838928]
diff --git a/image_classification/latest_kpis/train_cost_kpi_factor.txt b/image_classification/latest_kpis/train_cost_kpi_factor.txt
new file mode 100644
index 00000000..c896d6ff
--- /dev/null
+++ b/image_classification/latest_kpis/train_cost_kpi_factor.txt
@@ -0,0 +1 @@
+[2.875904941558838]
diff --git a/image_classification/latest_kpis/train_speed_kpi_factor.txt b/image_classification/latest_kpis/train_speed_kpi_factor.txt
new file mode 100644
index 00000000..6a664608
--- /dev/null
+++ b/image_classification/latest_kpis/train_speed_kpi_factor.txt
@@ -0,0 +1 @@
+[101.29667191639184]
\ No newline at end of file
diff --git a/image_classification/mobilenet.py b/image_classification/mobilenet.py
new file mode 100644
index 00000000..0a8197f1
--- /dev/null +++ b/image_classification/mobilenet.py @@ -0,0 +1,153 @@ +import os + +import paddle +import paddle.fluid as fluid +from paddle.fluid.initializer import MSRA +from paddle.fluid.param_attr import ParamAttr + +parameter_attr = ParamAttr(initializer=MSRA()) + + +def conv_bn_layer(input, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='relu', + use_cudnn=True): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + act=None, + use_cudnn=use_cudnn, + param_attr=parameter_attr, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + +def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride, + scale): + depthwise_conv = conv_bn_layer( + input=input, + filter_size=3, + num_filters=int(num_filters1 * scale), + stride=stride, + padding=1, + num_groups=int(num_groups * scale), + use_cudnn=False) + + pointwise_conv = conv_bn_layer( + input=depthwise_conv, + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + return pointwise_conv + + +def mobile_net(img, class_dim, scale=1.0): + + # conv1: 112x112 + tmp = conv_bn_layer( + img, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + # 56x56 + tmp = depthwise_separable( + tmp, + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + + tmp = depthwise_separable( + tmp, + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=2, + scale=scale) + + # 28x28 + tmp = depthwise_separable( + tmp, + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + + tmp = depthwise_separable( + tmp, + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=2, + scale=scale) + + # 14x14 + tmp = depthwise_separable( + tmp, + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + + tmp = depthwise_separable( + tmp, + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=2, + scale=scale) + + # 14x14 + for i in range(5): + tmp = depthwise_separable( + tmp, + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + scale=scale) + # 7x7 + tmp = depthwise_separable( + tmp, + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=2, + scale=scale) + + tmp = depthwise_separable( + tmp, + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=1, + scale=scale) + + tmp = fluid.layers.pool2d( + input=tmp, + pool_size=0, + pool_stride=1, + pool_type='avg', + global_pooling=True) + + tmp = fluid.layers.fc(input=tmp, + size=class_dim, + act='softmax', + param_attr=parameter_attr) + return tmp diff --git a/image_classification/reader.py b/image_classification/reader.py new file mode 100644 index 00000000..4061d1d3 --- /dev/null +++ b/image_classification/reader.py @@ -0,0 +1,164 @@ +import os +import math +import random +import functools +import numpy as np +import paddle +from PIL import Image, ImageEnhance + +random.seed(0) + +DATA_DIM = 224 + +THREAD = 8 +BUF_SIZE = 1024 + +DATA_DIR = 'ILSVRC2012' +TRAIN_LIST = 'ILSVRC2012/train_list.txt' +TEST_LIST = 'ILSVRC2012/test_list.txt' + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = 
int(round(img.size[1] * percent))
+    img = img.resize((resized_width, resized_height), Image.LANCZOS)
+    return img
+
+
+def crop_image(img, target_size, center):
+    width, height = img.size
+    size = target_size
+    if center:
+        w_start = (width - size) / 2
+        h_start = (height - size) / 2
+    else:
+        w_start = random.randint(0, width - size)
+        h_start = random.randint(0, height - size)
+    w_end = w_start + size
+    h_end = h_start + size
+    img = img.crop((w_start, h_start, w_end, h_end))
+    return img
+
+
+def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
+    aspect_ratio = math.sqrt(random.uniform(*ratio))
+    w = 1. * aspect_ratio
+    h = 1. / aspect_ratio
+
+    bound = min((float(img.size[0]) / img.size[1]) / (w**2),
+                (float(img.size[1]) / img.size[0]) / (h**2))
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+
+    target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
+                                                             scale_max)
+    target_size = math.sqrt(target_area)
+    w = int(target_size * w)
+    h = int(target_size * h)
+
+    i = random.randint(0, img.size[0] - w)
+    j = random.randint(0, img.size[1] - h)
+
+    img = img.crop((i, j, i + w, j + h))
+    img = img.resize((size, size), Image.LANCZOS)
+    return img
+
+
+def rotate_image(img):
+    angle = random.randint(-10, 10)
+    img = img.rotate(angle)
+    return img
+
+
+def distort_color(img):
+    def random_brightness(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Brightness(img).enhance(e)
+
+    def random_contrast(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Contrast(img).enhance(e)
+
+    def random_color(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Color(img).enhance(e)
+
+    ops = [random_brightness, random_contrast, random_color]
+    random.shuffle(ops)
+
+    img = ops[0](img)
+    img = ops[1](img)
+    img = ops[2](img)
+
+    return img
+
+
+def process_image(sample, mode, color_jitter, rotate):
+    img_path = sample[0]
+
+    img = Image.open(img_path)
+    if mode == 'train':
+        if rotate: img = rotate_image(img)
+        img = random_crop(img, DATA_DIM)
+    else:
+        img = resize_short(img, DATA_DIM)
+        img = crop_image(img, target_size=DATA_DIM, center=True)
+    if mode == 'train':
+        if color_jitter:
+            img = distort_color(img)
+        if random.randint(0, 1) == 1:
+            img = img.transpose(Image.FLIP_LEFT_RIGHT)
+
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
+
+    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
+    img -= img_mean
+    img /= img_std
+
+    if mode == 'train' or mode == 'test':
+        return img, sample[1]
+    elif mode == 'infer':
+        return [img]
+
+
+def _reader_creator(file_list,
+                    mode,
+                    shuffle=False,
+                    color_jitter=False,
+                    rotate=False):
+    def reader():
+        with open(file_list) as flist:
+            lines = [line.strip() for line in flist]
+            if shuffle:
+                random.shuffle(lines)
+            for line in lines:
+                if mode == 'train' or mode == 'test':
+                    img_path, label = line.split()
+                    img_path = os.path.join(DATA_DIR, img_path)
+                    yield img_path, int(label)
+                elif mode == 'infer':
+                    img_path = os.path.join(DATA_DIR, line)
+                    yield [img_path]
+
+    mapper = functools.partial(
+        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
+
+    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
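+
+
+# The wrappers below are thin shims over _reader_creator; process_image then
+# runs on THREAD worker threads via paddle.reader.xmap_readers, buffered up
+# to BUF_SIZE samples.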
+def train(file_list=TRAIN_LIST):
+    return _reader_creator(
+        file_list, 'train', shuffle=True, color_jitter=False, rotate=False)
+
+
+def test(file_list=TEST_LIST):
+    return _reader_creator(file_list, 'test', shuffle=False)
+
+
+def infer(file_list):
+    return _reader_creator(file_list, 'infer', shuffle=False)
diff --git a/image_classification/run.xsh b/image_classification/run.xsh
new file mode 100755
index 00000000..f003feb2
--- /dev/null
+++ b/image_classification/run.xsh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${multi_se_resnext_cudaid:=0,1,2,3} # use cards 0,1,2,3 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py --batch_size=64
+mv train_speed_kpi_factor.txt four_card_train_speed_kpi_factor.txt
+
+cudaid=${se_resnext_cudaid:=0} # use the 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py --batch_size=64
diff --git a/image_classification/se_resnext.py b/image_classification/se_resnext.py
new file mode 100644
index 00000000..ad533c75
--- /dev/null
+++ b/image_classification/se_resnext.py
@@ -0,0 +1,138 @@
+import os
+import numpy as np
+import time
+import sys
+import paddle
+import paddle.fluid as fluid
+import reader
+import paddle.fluid.layers.control_flow as control_flow
+import paddle.fluid.layers.nn as nn
+import paddle.fluid.layers.tensor as tensor
+import math
+
+
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) / 2,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv, act=act)
+
+
+def squeeze_excitation(input, num_channels, reduction_ratio):
+    pool = fluid.layers.pool2d(
+        input=input, pool_size=0, pool_type='avg', global_pooling=True)
+    stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+    squeeze = fluid.layers.fc(input=pool,
+                              size=num_channels / reduction_ratio,
+                              act='relu',
+                              param_attr=fluid.param_attr.ParamAttr(
+                                  initializer=fluid.initializer.Uniform(-stdv,
+                                                                        stdv)))
+    stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
+    excitation = fluid.layers.fc(input=squeeze,
+                                 size=num_channels,
+                                 act='sigmoid',
+                                 param_attr=fluid.param_attr.ParamAttr(
+                                     initializer=fluid.initializer.Uniform(
+                                         -stdv, stdv)))
+    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+    return scale
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]
+    if ch_in != ch_out or stride != 1:
+        filter_size = 1
+        return conv_bn_layer(input, ch_out, filter_size, stride)
+    else:
+        return input
+
+
+def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+    scale = squeeze_excitation(
+        input=conv2,
+        num_channels=num_filters * 2,
+        reduction_ratio=reduction_ratio)
+
+    short = shortcut(input, num_filters * 2, stride)
+
+    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+
+def SE_ResNeXt(input, class_dim, infer=False, layers=50):
+    supported_layers = [50, 152]
+    if layers not in supported_layers:
+        print("supported layers are", supported_layers, \
+              "but input layer is ", layers)
+        exit()
+    if layers == 50:
+        cardinality = 32
+        reduction_ratio = 16
+        depth = [3, 4, 6, 3]
+        num_filters = [128, 256, 512, 1024]
+
+        conv = conv_bn_layer(
+            input=input, num_filters=64, filter_size=7, stride=2, act='relu')
+        conv = fluid.layers.pool2d(
+            input=conv,
pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=3, stride=2, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=128, filter_size=3, stride=1, act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + if not infer: + drop = fluid.layers.dropout(x=pool, dropout_prob=0.5) + else: + drop = pool + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc(input=drop, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out diff --git a/image_classification/train.py b/image_classification/train.py new file mode 100644 index 00000000..c7a4fb43 --- /dev/null +++ b/image_classification/train.py @@ -0,0 +1,407 @@ +import os +import numpy as np +import time +import sys +import paddle +import paddle.fluid as fluid +from se_resnext import SE_ResNeXt +from mobilenet import mobile_net +import paddle.dataset.flowers as flowers +import reader + +import argparse +import functools +import paddle.fluid.layers.ops as ops +from utility import add_arguments, print_arguments +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter +import math + +from continuous_evaluation import (train_acc_top1_kpi, train_acc_top5_kpi, + train_cost_kpi, train_speed_kpi) +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.") +add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.") +add_arg('parallel_exe', bool, True, "Whether to use ParallelExecutor to train or not.") +add_arg('init_model', str, None, "Whether to use initialized model.") +add_arg('pretrained_model', str, None, "Whether to use pretrained model.") +add_arg('lr_strategy', str, "cosine_decay", + "Set the learning rate decay strategy.") +add_arg('model', str, "se_resnext", "Set the network to use.") + + +def cosine_decay(learning_rate, step_each_epoch, epochs=120): + """Applies cosine decay to the learning rate. 
+    decayed_lr = learning_rate * (math.cos(epoch * (math.pi / epochs)) + 1) / 2
+    """
+    global_step = _decay_step_counter()
+
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * \
+            (ops.cos(epoch * (math.pi / epochs)) + 1) / 2
+    return decayed_lr
+
+
+def train_parallel_do(args,
+                      learning_rate,
+                      batch_size,
+                      num_passes,
+                      init_model=None,
+                      pretrained_model=None,
+                      model_save_dir='model',
+                      parallel=True,
+                      use_nccl=True,
+                      lr_strategy=None,
+                      layers=50):
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
+
+        with pd.do():
+            image_ = pd.read_input(image)
+            label_ = pd.read_input(label)
+            if args.model == 'se_resnext':
+                out = SE_ResNeXt(
+                    input=image_, class_dim=class_dim, layers=layers)
+            else:
+                out = mobile_net(img=image_, class_dim=class_dim)
+
+            cost = fluid.layers.cross_entropy(input=out, label=label_)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
+            pd.write_output(avg_cost)
+            pd.write_output(acc_top1)
+            pd.write_output(acc_top5)
+
+        avg_cost, acc_top1, acc_top5 = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        acc_top1 = fluid.layers.mean(x=acc_top1)
+        acc_top5 = fluid.layers.mean(x=acc_top5)
+    else:
+        if args.model == 'se_resnext':
+            out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+        else:
+            out = mobile_net(img=image, class_dim=class_dim)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    inference_program = fluid.default_main_program().clone(for_test=True)
+
+    if "piecewise_decay" in lr_strategy:
+        bd = lr_strategy["piecewise_decay"]["bd"]
+        lr = lr_strategy["piecewise_decay"]["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    elif "cosine_decay" in lr_strategy:
+        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
+        epochs = lr_strategy["cosine_decay"]["epochs"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=cosine_decay(
+                learning_rate=learning_rate,
+                step_each_epoch=step_each_epoch,
+                epochs=epochs),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    opts = optimizer.minimize(avg_cost)
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    for pass_id in range(num_passes):
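+        # train_info/test_info gather [loss, top1 acc, top5 acc] per batch;
+        # their per-pass means are reported in the "End pass" line below.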
+
+
+def train_parallel_do(args,
+                      learning_rate,
+                      batch_size,
+                      num_passes,
+                      init_model=None,
+                      pretrained_model=None,
+                      model_save_dir='model',
+                      parallel=True,
+                      use_nccl=True,
+                      lr_strategy=None,
+                      layers=50):
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
+
+        with pd.do():
+            image_ = pd.read_input(image)
+            label_ = pd.read_input(label)
+            # string options must be compared with `==`; `is` checks identity
+            if args.model == 'se_resnext':
+                out = SE_ResNeXt(
+                    input=image_, class_dim=class_dim, layers=layers)
+            else:
+                out = mobile_net(img=image_, class_dim=class_dim)
+
+            cost = fluid.layers.cross_entropy(input=out, label=label_)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
+            pd.write_output(avg_cost)
+            pd.write_output(acc_top1)
+            pd.write_output(acc_top5)
+
+        avg_cost, acc_top1, acc_top5 = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        acc_top1 = fluid.layers.mean(x=acc_top1)
+        acc_top5 = fluid.layers.mean(x=acc_top5)
+    else:
+        if args.model == 'se_resnext':
+            out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+        else:
+            out = mobile_net(img=image, class_dim=class_dim)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    inference_program = fluid.default_main_program().clone(for_test=True)
+
+    if "piecewise_decay" in lr_strategy:
+        bd = lr_strategy["piecewise_decay"]["bd"]
+        lr = lr_strategy["piecewise_decay"]["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    elif "cosine_decay" in lr_strategy:
+        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
+        epochs = lr_strategy["cosine_decay"]["epochs"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=cosine_decay(
+                learning_rate=learning_rate,
+                step_each_epoch=step_each_epoch,
+                epochs=epochs),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    opts = optimizer.minimize(avg_cost)
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
+        for batch_id, data in enumerate(train_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            train_info[0].append(loss[0])
+            train_info[1].append(acc1[0])
+            train_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, \
+                    acc1 {3}, acc5 {4} time {5}"
+                      .format(pass_id, \
+                              batch_id, loss[0], acc1[0], acc5[0], \
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
+        # enumerate so batch_id refers to the current test batch rather than
+        # the stale value left over from the training loop
+        for batch_id, data in enumerate(test_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                inference_program,
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            test_info[0].append(loss[0])
+            test_info[1].append(acc1[0])
+            test_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0},testbatch {1},loss {2}, \
+                    acc1 {3},acc5 {4},time {5}"
+                      .format(pass_id, \
+                              batch_id, loss[0], acc1[0], acc5[0], \
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
+            test_loss {4}, test_acc1 {5}, test_acc5 {6}"
+              .format(pass_id, \
+                      train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
+                      test_acc5))
+        sys.stdout.flush()
+
+        model_path = os.path.join(model_save_dir + '/' + args.model,
+                                  str(pass_id))
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)
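+
+
+# Note: train_parallel_do (above) replicates the network per device through
+# fluid.layers.ParallelDo, while train_parallel_exe (below) builds a single
+# program and lets fluid.ParallelExecutor schedule it across GPUs; __main__
+# selects between them via the --parallel_exe flag.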
+
+
+def train_parallel_exe(args,
+                       learning_rate,
+                       batch_size,
+                       num_passes,
+                       init_model=None,
+                       pretrained_model=None,
+                       model_save_dir='model',
+                       parallel=True,
+                       use_nccl=True,
+                       lr_strategy=None,
+                       layers=50):
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    # string options must be compared with `==`; `is` checks identity
+    if args.model == 'se_resnext':
+        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+    else:
+        out = mobile_net(img=image, class_dim=class_dim)
+
+    cost = fluid.layers.cross_entropy(input=out, label=label)
+    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    test_program = fluid.default_main_program().clone(for_test=True)
+
+    if "piecewise_decay" in lr_strategy:
+        bd = lr_strategy["piecewise_decay"]["bd"]
+        lr = lr_strategy["piecewise_decay"]["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    elif "cosine_decay" in lr_strategy:
+        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
+        epochs = lr_strategy["cosine_decay"]["epochs"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=cosine_decay(
+                learning_rate=learning_rate,
+                step_each_epoch=step_each_epoch,
+                epochs=epochs),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    opts = optimizer.minimize(avg_cost)
+
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    # random_seed is a program attribute, so default_startup_program() must be
+    # called; assigning to the function object itself has no effect
+    fluid.default_startup_program().random_seed = 1000
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    train_reader = paddle.batch(flowers.train(), batch_size=batch_size)
+    test_reader = paddle.batch(flowers.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
+    test_exe = fluid.ParallelExecutor(
+        use_cuda=True, main_program=test_program, share_vars_from=train_exe)
+
+    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
+    train_speed = []
+    for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
+        pass_time = 0
+        pass_num = 0
+        pass_speed = 0.0
+        for batch_id, data in enumerate(train_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = train_exe.run(fetch_list,
+                                             feed=feeder.feed(data))
+            t2 = time.time()
+            period = t2 - t1
+            pass_time += period
+            pass_num += len(data)
+            loss = np.mean(np.array(loss))
+            acc1 = np.mean(np.array(acc1))
+            acc5 = np.mean(np.array(acc5))
+            train_info[0].append(loss)
+            train_info[1].append(acc1)
+            train_info[2].append(acc5)
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, \
+                    acc1 {3}, acc5 {4} time {5}"
+                      .format(pass_id, \
+                              batch_id, loss, acc1, acc5, \
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
+        pass_speed = pass_num / pass_time
+        train_speed.append(pass_speed)
+        if pass_id == num_passes - 1:
+            train_acc_top1_kpi.add_record(train_acc1)
+            train_acc_top5_kpi.add_record(train_acc5)
+            train_cost_kpi.add_record(train_loss)
+            # average the per-pass speeds collected in train_speed; averaging
+            # the scalar pass_speed would just record the last pass
+            mean_pass_speed = np.array(train_speed).mean()
+            train_speed_kpi.add_record(mean_pass_speed)
+        # enumerate so batch_id refers to the current test batch rather than
+        # the stale value left over from the training loop
+        for batch_id, data in enumerate(test_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = test_exe.run(fetch_list, feed=feeder.feed(data))
+            t2 = time.time()
+            period = t2 - t1
+            loss = np.mean(np.array(loss))
+            acc1 = np.mean(np.array(acc1))
+            acc5 = np.mean(np.array(acc5))
+            test_info[0].append(loss)
+            test_info[1].append(acc1)
+            test_info[2].append(acc5)
+            if batch_id % 10 == 0:
+                print("Pass {0},testbatch {1},loss {2}, \
+                    acc1 {3},acc5 {4},time {5}"
+                      .format(pass_id, \
+                              batch_id, loss, acc1, acc5, \
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
+            test_loss {4}, test_acc1 {5}, test_acc5 {6}, pass_time {7}, train_speed {8}"
+              .format(pass_id, \
+                      train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
+                      test_acc5, pass_time, pass_num / pass_time))
+        sys.stdout.flush()
+    train_acc_top1_kpi.persist()
+    train_acc_top5_kpi.persist()
+    train_cost_kpi.persist()
+    train_speed_kpi.persist()
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+
+    total_images = 1281167
+    batch_size = args.batch_size
+    step = int(total_images / batch_size + 1)
+    num_epochs = 5
+
+    learning_rate_mode = args.lr_strategy
+    lr_strategy = {}
+    if learning_rate_mode == "piecewise_decay":
+        epoch_points = [30, 60, 90]
+        bd = [e * step for e in epoch_points]
+        lr = [0.1, 0.01, 0.001, 0.0001]
+        lr_strategy[learning_rate_mode] = {"bd": bd, "lr": lr}
+    elif learning_rate_mode == "cosine_decay":
+        lr_strategy[learning_rate_mode] = {
+            "step_each_epoch": step,
+            "epochs": num_epochs
+        }
+    else:
+        lr_strategy = None
+
+    use_nccl = True
+    # layers: 50, 152
+    layers = args.num_layers
+    method = train_parallel_exe if args.parallel_exe else train_parallel_do
+    init_model = args.init_model if args.init_model else None
+    pretrained_model = args.pretrained_model if args.pretrained_model else None
+    method(
+        args,
+        learning_rate=0.1,
+        batch_size=batch_size,
+        num_passes=num_epochs,
+        init_model=init_model,
+        pretrained_model=pretrained_model,
+        parallel=True,
+        use_nccl=True,
+        lr_strategy=lr_strategy,
+        layers=layers)
diff --git a/image_classification/utility.py b/image_classification/utility.py
new file mode 100644
index 00000000..506e6007
--- /dev/null
+++ b/image_classification/utility.py
@@ -0,0 +1,62 @@
+"""Contains common utility functions."""
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+from paddle.fluid import core
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
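+
+# A minimal usage sketch of the two helpers above, mirroring how
+# image_classification/train.py wires them together (illustrative only):
+#
+#     import argparse
+#     import functools
+#     from utility import add_arguments, print_arguments
+#
+#     parser = argparse.ArgumentParser(description=__doc__)
+#     add_arg = functools.partial(add_arguments, argparser=parser)
+#     add_arg('batch_size', int, 256, "Minibatch size.")
+#     args = parser.parse_args()
+#     print_arguments(args)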
diff --git a/language_model/README.md b/language_model/README.md
new file mode 100644
index 00000000..91ce2d7f
--- /dev/null
+++ b/language_model/README.md
@@ -0,0 +1,148 @@
+# Language Model
+
+A brief directory structure and description for this example:
+
+```text
+.
+├── README.md            # documentation
+├── train.py             # training script
+├── infer.py             # inference script
+└── utils.py             # common utilities
+```
+
+
+## Introduction
+
+For an introduction to recurrent neural network language models, see the paper [Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329). In this example we implement a GRU-RNN language model.
+
+## Training
+
+Run the command `python train.py` to start training the model.
+```python
+python train.py
+```
+
+The currently supported parameters can be found in the `train_net` function of [train.py](./train.py):
+```python
+vocab, train_reader, test_reader = utils.prepare_data(
+    batch_size=20,  # batch size
+    buffer_size=1000,  # buffer size, default value is OK
+    word_freq_threshold=0)  # vocabulary related parameter, and words with frequency below this value will be filtered
+
+train(train_reader=train_reader,
+      vocab=vocab,
+      network=network,
+      hid_size=200,  # embedding and hidden size
+      base_lr=1.0,  # base learning rate
+      batch_size=20,  # batch size, the same as that in prepare_data
+      pass_num=12,  # the number of passes for training
+      use_cuda=True,  # whether to use GPU card
+      parallel=False,  # whether to be parallel
+      model_dir="model",  # directory to save model
+      init_low_bound=-0.1,  # uniform parameter initialization lower bound
+      init_high_bound=0.1)  # uniform parameter initialization upper bound
+```
+
+## Customizing the Network Structure
+
+The network structure can be adjusted in the `network` function of [train.py](./train.py); the current structure is shown below. Note that `fc0` projects to `hid_size * 3` because `dynamic_gru` expects the update-gate, reset-gate, and candidate projections concatenated:
+```python
+emb = fluid.layers.embedding(input=src, size=[vocab_size, hid_size],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+            learning_rate=emb_lr_x),
+        is_sparse=True)
+
+fc0 = fluid.layers.fc(input=emb, size=hid_size * 3,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+            learning_rate=gru_lr_x))
+gru_h0 = fluid.layers.dynamic_gru(input=fc0, size=hid_size,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+            learning_rate=gru_lr_x))
+
+fc = fluid.layers.fc(input=gru_h0, size=vocab_size, act='softmax',
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+            learning_rate=fc_lr_x))
+
+cost = fluid.layers.cross_entropy(input=fc, label=dst)
+```
+
+## Sample Training Results
+
+The log of training on a single Tesla K40m GPU card is shown below:
+```text
+epoch_1 start
+step:100 ppl:771.053
+step:200 ppl:449.597
+step:300 ppl:642.654
+step:400 ppl:458.128
+step:500 ppl:510.912
+step:600 ppl:451.545
+step:700 ppl:364.404
+step:800 ppl:324.272
+step:900 ppl:360.797
+step:1000 ppl:275.761
+step:1100 ppl:294.599
+step:1200 ppl:335.877
+step:1300 ppl:185.262
+step:1400 ppl:241.744
+step:1500 ppl:211.507
+step:1600 ppl:233.431
+step:1700 ppl:298.767
+step:1800 ppl:203.403
+step:1900 ppl:158.828
+step:2000 ppl:171.148
+step:2100 ppl:280.884
+epoch:1 num_steps:2104 time_cost(s):47.478780
+model saved in model/epoch_1
+epoch_2 start
+step:100 ppl:238.099
+step:200 ppl:136.527
+step:300 ppl:204.184
+step:400 ppl:252.886
+step:500 ppl:177.377
+step:600 ppl:197.688
+step:700 ppl:131.650
+step:800 ppl:223.906
+step:900 ppl:144.785
+step:1000 ppl:176.286
+step:1100 ppl:148.158
+step:1200 ppl:203.581
+step:1300 ppl:168.208
+step:1400 ppl:159.412
+step:1500 ppl:114.032
+step:1600 ppl:157.985
+step:1700 ppl:147.743
+step:1800 ppl:88.676
+step:1900 ppl:141.962
+step:2000 ppl:106.087
+step:2100 ppl:122.709
+epoch:2 num_steps:2104 time_cost(s):47.583789
+model saved in model/epoch_2
+...
+```
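+
+The `ppl` column above is perplexity, the exponential of the average per-word cross-entropy cost. A minimal sketch of the computation, mirroring how [infer.py](./infer.py) accumulates it over a dataset:
+
+```python
+import math
+
+def perplexity(accum_cost, accum_words):
+    """ppl = exp(total cross-entropy cost / number of predicted words)"""
+    return math.exp(accum_cost / accum_words)
+
+print("%.3f" % perplexity(1070.0, 200))  # ~210.6
+```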
+
+## Inference
+Run `python infer.py model_dir start_epoch last_epoch(inclusive)` to run inference, where `start_epoch` is the first epoch to evaluate and `last_epoch` the last one (inclusive), e.g.
+```python
+python infer.py model 1 12     # prediction from epoch 1 to epoch 12
+```
+
+## Sample Inference Results
+```text
+model:model/epoch_1 ppl:254.540 time_cost(s):3.29
+model:model/epoch_2 ppl:177.671 time_cost(s):3.27
+model:model/epoch_3 ppl:156.251 time_cost(s):3.27
+model:model/epoch_4 ppl:139.036 time_cost(s):3.27
+model:model/epoch_5 ppl:132.661 time_cost(s):3.27
+model:model/epoch_6 ppl:130.092 time_cost(s):3.28
+model:model/epoch_7 ppl:128.751 time_cost(s):3.27
+model:model/epoch_8 ppl:125.411 time_cost(s):3.27
+model:model/epoch_9 ppl:124.604 time_cost(s):3.28
+model:model/epoch_10 ppl:124.754 time_cost(s):3.29
+model:model/epoch_11 ppl:125.421 time_cost(s):3.27
+model:model/epoch_12 ppl:125.676 time_cost(s):3.27
+```
diff --git a/language_model/continuous_evaluation.py b/language_model/continuous_evaluation.py
new file mode 100644
index 00000000..a7bc53bd
--- /dev/null
+++ b/language_model/continuous_evaluation.py
@@ -0,0 +1,17 @@
+"""
+continuous_evaluation.py
+"""
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0)
+imikolov_20_pass_duration_kpi = DurationKpi('imikolov_20_pass_duration', 0.02,
+                                            0, actived=True)
+
+tracking_kpis = [
+    imikolov_20_avg_ppl_kpi,
+    imikolov_20_pass_duration_kpi,
+]
diff --git a/language_model/infer.py b/language_model/infer.py
new file mode 100644
index 00000000..a183d548
--- /dev/null
+++ b/language_model/infer.py
@@ -0,0 +1,65 @@
+import sys
+import time
+import math
+import unittest
+import contextlib
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+import utils
+
+
+def infer(test_reader, use_cuda, model_path):
+    """ inference function """
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    with fluid.scope_guard(fluid.core.Scope()):
+        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(
+            model_path, exe)
+
+        accum_cost = 0.0
+        accum_words = 0
+        t0 = time.time()
+        for data in test_reader():
+            src_wordseq = utils.to_lodtensor(map(lambda x: x[0], data), place)
+            dst_wordseq = utils.to_lodtensor(map(lambda x: x[1], data), place)
+            avg_cost = exe.run(
+                infer_program,
+                feed={"src_wordseq": src_wordseq,
+                      "dst_wordseq": dst_wordseq},
+                fetch_list=fetch_vars)
+
+            nwords = src_wordseq.lod()[0][-1]
+
+            cost = np.array(avg_cost) * nwords
+            accum_cost += cost
+            accum_words += nwords
+
+        ppl = math.exp(accum_cost / accum_words)
+        t1 = time.time()
+        print("model:%s ppl:%.3f time_cost(s):%.2f" %
+              (model_path, ppl, t1 - t0))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        # fill the %s placeholder with the script name
+        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
+              sys.argv[0])
+        exit(0)
+
+    model_dir = sys.argv[1]
+    try:
+        start_index = int(sys.argv[2])
+        last_index = int(sys.argv[3])
+    except ValueError:
+        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
+              sys.argv[0])
+        exit(-1)
+
+    vocab, train_reader, test_reader = utils.prepare_data(
+        batch_size=20, buffer_size=1000, word_freq_threshold=0)
+
+    for epoch in xrange(start_index, last_index + 1):
+        epoch_path = model_dir + "/epoch_" + str(epoch)
+        infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
diff --git a/language_model/latest_kpis/imikolov_20_avg_ppl_factor.txt b/language_model/latest_kpis/imikolov_20_avg_ppl_factor.txt
new file mode 100644
index 00000000..b570c294
---
/dev/null +++ b/language_model/latest_kpis/imikolov_20_avg_ppl_factor.txt @@ -0,0 +1 @@ +[32.465272032979705] diff --git a/language_model/latest_kpis/imikolov_20_pass_duration_factor.txt b/language_model/latest_kpis/imikolov_20_pass_duration_factor.txt new file mode 100644 index 00000000..c9ab10b4 --- /dev/null +++ b/language_model/latest_kpis/imikolov_20_pass_duration_factor.txt @@ -0,0 +1 @@ +[29.741339857578278] diff --git a/language_model/run.xsh b/language_model/run.xsh new file mode 100755 index 00000000..5a40853d --- /dev/null +++ b/language_model/run.xsh @@ -0,0 +1,8 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +cudaid=${language_model:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py diff --git a/language_model/train.py b/language_model/train.py new file mode 100644 index 00000000..773c7431 --- /dev/null +++ b/language_model/train.py @@ -0,0 +1,171 @@ +import sys +import time + +import numpy as np +import math + +import paddle.fluid as fluid +import paddle + +import utils + +from continuous_evaluation import imikolov_20_avg_ppl_kpi, imikolov_20_pass_duration_kpi + + +def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound): + """ network definition """ + emb_lr_x = 10.0 + gru_lr_x = 1.0 + fc_lr_x = 1.0 + emb = fluid.layers.embedding( + input=src, + size=[vocab_size, hid_size], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=emb_lr_x), + is_sparse=True) + + fc0 = fluid.layers.fc(input=emb, + size=hid_size * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=gru_lr_x)) + gru_h0 = fluid.layers.dynamic_gru( + input=fc0, + size=hid_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=gru_lr_x)) + + fc = fluid.layers.fc(input=gru_h0, + size=vocab_size, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=fc_lr_x)) + + cost = fluid.layers.cross_entropy(input=fc, label=dst) + return cost + + +def train(train_reader, + vocab, + network, + hid_size, + base_lr, + batch_size, + pass_num, + use_cuda, + parallel, + model_dir, + init_low_bound=-0.04, + init_high_bound=0.04): + """ train network """ + vocab_size = len(vocab) + + src_wordseq = fluid.layers.data( + name="src_wordseq", shape=[1], dtype="int64", lod_level=1) + dst_wordseq = fluid.layers.data( + name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + + avg_cost = None + if not parallel: + cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size, + init_low_bound, init_high_bound) + avg_cost = fluid.layers.mean(x=cost) + else: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + cost = network( + pd.read_input(src_wordseq), + pd.read_input(dst_wordseq), vocab_size, hid_size, + init_low_bound, init_high_bound) + pd.write_output(cost) + + cost = pd() + avg_cost = fluid.layers.mean(x=cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=base_lr, + decay_steps=2100 * 4, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + total_time 
= 0.0
+    for pass_idx in xrange(pass_num):
+        epoch_idx = pass_idx + 1
+        print "epoch_%d start" % epoch_idx
+
+        t0 = time.time()
+        i = 0
+        newest_ppl = 0
+        for data in train_reader():
+            i += 1
+            lod_src_wordseq = utils.to_lodtensor(
+                map(lambda x: x[0], data), place)
+            lod_dst_wordseq = utils.to_lodtensor(
+                map(lambda x: x[1], data), place)
+            ret_avg_cost = exe.run(fluid.default_main_program(),
+                                   feed={
+                                       "src_wordseq": lod_src_wordseq,
+                                       "dst_wordseq": lod_dst_wordseq
+                                   },
+                                   fetch_list=[avg_cost],
+                                   use_program_cache=True)
+            avg_ppl = math.exp(ret_avg_cost[0])
+            newest_ppl = avg_ppl
+            if i % 100 == 0:
+                print "step:%d ppl:%.3f" % (i, avg_ppl)
+
+        t1 = time.time()
+        total_time += t1 - t0
+        print "epoch:%d num_steps:%d time_cost(s):%f" % (
+            epoch_idx, i, total_time / epoch_idx)
+
+        if pass_idx == pass_num - 1:
+            imikolov_20_pass_duration_kpi.add_record(total_time / epoch_idx)
+            imikolov_20_avg_ppl_kpi.add_record(newest_ppl)
+        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
+        feed_var_names = ["src_wordseq", "dst_wordseq"]
+        fetch_vars = [avg_cost]
+        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars,
+                                      exe)
+        print("model saved in %s" % save_dir)
+    imikolov_20_pass_duration_kpi.persist()
+    imikolov_20_avg_ppl_kpi.persist()
+    print("finish training")
+
+
+def train_net():
+    """ do training """
+    batch_size = 20
+    vocab, train_reader, test_reader = utils.prepare_data(
+        batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
+    train(
+        train_reader=train_reader,
+        vocab=vocab,
+        network=network,
+        hid_size=200,
+        base_lr=1.0,
+        batch_size=batch_size,
+        pass_num=12,
+        use_cuda=True,
+        parallel=True,
+        model_dir="model",
+        init_low_bound=-0.1,
+        init_high_bound=0.1)
+
+
+if __name__ == "__main__":
+    train_net()
diff --git a/language_model/utils.py b/language_model/utils.py
new file mode 100644
index 00000000..9ca0ef4d
--- /dev/null
+++ b/language_model/utils.py
@@ -0,0 +1,41 @@
+import sys
+import time
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle
+
+
+def to_lodtensor(data, place):
+    """ convert to LODtensor """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
+    """ prepare the English Penn Treebank (PTB) data """
+    vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imikolov.train(
+                vocab,
+                buffer_size,
+                data_type=paddle.dataset.imikolov.DataType.SEQ),
+            buf_size=buffer_size),
+        batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.imikolov.test(
+            vocab, buffer_size,
+            data_type=paddle.dataset.imikolov.DataType.SEQ),
+        batch_size)
+    return vocab, train_reader, test_reader
diff --git a/lstm/continuous_evaluation.py b/lstm/continuous_evaluation.py
new file mode 100644
index 00000000..e7efb66a
--- /dev/null
+++ b/lstm/continuous_evaluation.py
@@ -0,0 +1,17 @@
+"""
+continuous_evaluation.py
+"""
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import AccKpi
+from kpi import CostKpi
+from kpi import DurationKpi
+
+imdb_32_train_speed_kpi = AccKpi('imdb_32_train_speed', 0.03, 0, actived=True)
+imdb_32_gpu_memory_kpi = DurationKpi('imdb_32_gpu_memory', 0.05, 0,
+                                     actived=True)
+
+tracking_kpis = [
+    imdb_32_train_speed_kpi,
+    imdb_32_gpu_memory_kpi,
+]
diff --git a/lstm/get_gpu_data.py b/lstm/get_gpu_data.py
new file mode 100644
index 00000000..7afb3fdb
--- /dev/null
+++ b/lstm/get_gpu_data.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+########################################################################
+#
+# Copyright (c) 2018 Baidu.com, Inc. All Rights Reserved
+#
+########################################################################
+"""
+File: get_gpu_data.py
+Author: paddle(paddle@baidu.com)
+Date: 2018/04/02 15:57:14
+"""
+import argparse
+from continuous_evaluation import tracking_kpis
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='imdb',
+    help='Optional dataset for benchmark.')
+args = parser.parse_args()
+
+
+def save_gpu_data():
+    mem_list = []
+    with open('memory.txt', 'r') as f:
+        for i, data in enumerate(f.readlines()):
+            if i == 0:
+                continue
+            mem_list.append(int(data.split("\n")[0].split(" ")[0]))
+    # initialize the name actually used below (was misspelled gpu_memory_factor)
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size):
+            gpu_memory_kpi = kpi
+            gpu_memory_kpi.add_record(max(mem_list))
+            gpu_memory_kpi.persist()
+
+
+if __name__ == "__main__":
+    save_gpu_data()
diff --git a/lstm/latest_kpis/imdb_32_gpu_memory_factor.txt b/lstm/latest_kpis/imdb_32_gpu_memory_factor.txt
new file mode 100644
index 00000000..7a9fb042
--- /dev/null
+++ b/lstm/latest_kpis/imdb_32_gpu_memory_factor.txt
@@ -0,0 +1 @@
+[1560]
diff --git a/lstm/latest_kpis/imdb_32_train_speed_factor.txt b/lstm/latest_kpis/imdb_32_train_speed_factor.txt
new file mode 100644
index 00000000..1f3013cb
--- /dev/null
+++ b/lstm/latest_kpis/imdb_32_train_speed_factor.txt
@@ -0,0 +1 @@
+[779.2451171875]
diff --git a/lstm/model.py b/lstm/model.py
new file mode 100644
index 00000000..418a79b6
--- /dev/null
+++ b/lstm/model.py
@@ -0,0 +1,283 @@
+"""
+stacked_dynamic_lstm model for fluid
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import commands
+import subprocess
+import threading
+import time
+import numpy as np
+
+import numpy
+import paddle
+import paddle.dataset.imdb as imdb
+import paddle.fluid as fluid
+import paddle.batch as batch
+import paddle.fluid.profiler as profiler
+
+from continuous_evaluation import tracking_kpis
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial minibatches to skip, for a fairer performance test'
+    )
+    parser.add_argument(
+        '--iterations',
+        type=int,
+        default=80,
+        help='The number of minibatches.')
+    parser.add_argument(
+        '--emb_dim',
+        type=int,
+        default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=100,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='CPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpu_id',
+        type=int,
+        default=3,
+        help='The GPU Card Id. (default: %(default)d)')
+    parser.add_argument(
+        '--crop_size',
+        type=int,
+        default=int(os.environ.get('CROP_SIZE', '1500')),
+        help='The max sentence length of input. Since this model uses a plain RNN,'
+        ' gradients could explode if the sentence is too long.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    args = parser.parse_args()
+    return args
+
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+    unk_value = word_dict['<unk>']
+
+    def __impl__():
+        for item in reader():
+            if len([x for x in item[0] if x != unk_value]) < crop_size:
+                yield item
+
+    return __impl__
+
+
+def main():
+    args = parse_args()
+    lstm_size = args.hidden_dim
+
+    data = fluid.layers.data(
+        name="words", shape=[1], lod_level=1, dtype='int64')
+    sentence = fluid.layers.embedding(
+        input=data, size=[len(word_dict), args.emb_dim])
+
+    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(
+                ipt,
+                hidden,
+                size, ):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            gate = fluid.layers.sums(input=[gate0, gate1])
+            return gate
+
+        forget_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        input_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        output_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        cell_gate = fluid.layers.tanh(
+            x=gate_common(word, prev_hidden, lstm_size))
+
+        cell = fluid.layers.sums(input=[
+            fluid.layers.elementwise_mul(
+                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+                    x=input_gate, y=cell_gate)
+        ])
+
+        hidden = fluid.layers.elementwise_mul(
+            x=output_gate, y=fluid.layers.tanh(x=cell))
+
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
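+
+    # For reference, the rnn.block() above is a hand-rolled LSTM step; an
+    # eager NumPy sketch of the same update (W*/U* stand for the per-gate fc
+    # weights that gate_common creates; illustrative only):
+    #
+    #     f = sigmoid(x.dot(Wf) + h.dot(Uf))   # forget gate
+    #     i = sigmoid(x.dot(Wi) + h.dot(Ui))   # input gate
+    #     o = sigmoid(x.dot(Wo) + h.dot(Uo))   # output gate
+    #     g = np.tanh(x.dot(Wg) + h.dot(Ug))   # cell candidate
+    #     c_new = f * c + i * g                # rnn.update_memory(prev_cell, cell)
+    #     h_new = o * np.tanh(c_new)           # rnn.update_memory(prev_hidden, hidden)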
+
+    last = fluid.layers.sequence_pool(rnn(), 'last')
+    logit = fluid.layers.fc(input=last, size=2, act='softmax')
+    loss = fluid.layers.cross_entropy(
+        input=logit,
+        label=fluid.layers.data(
+            name='label', shape=[1], dtype='int64'))
+    loss = fluid.layers.mean(x=loss)
+
+    # add acc
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+        shape=[1], dtype='int64'), total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    adam = fluid.optimizer.Adam()
+    adam.minimize(loss)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.train(word_dict), args.crop_size),
+            buf_size=25000),
+        batch_size=args.batch_size)
+
+    train_acc_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == 'imdb_%s_train_acc' % (args.batch_size):
+            train_acc_kpi = kpi
+    train_speed_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == 'imdb_%s_train_speed' % (args.batch_size):
+            train_speed_kpi = kpi
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            tensor_words = to_lodtensor([x[0] for x in data], place)
+            label = numpy.array([x[1] for x in data]).astype("int64")
+            label = label.reshape((-1, 1))
+            loss_np, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": label},
+                fetch_list=[loss, batch_acc, batch_size_tensor])
+            iters += 1
+            for x in data:
+                num_samples += len(x[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss_np, acc)
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32'))
+        break
+    train_speed_kpi.persist()
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def print_arguments(args):
+    print('----------- lstm Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def collect_gpu_memory_data(alive):
+    """
+    collect the GPU memory data
+    """
+    global is_alive
+    status, output = commands.getstatusoutput('rm -rf memory.txt')
+    if status == 0:
+        print('del memory.txt')
+    command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id
+    p = subprocess.Popen(command, shell=True)
+    if p.pid < 0:
+        print('Get GPU memory data error')
+    while (is_alive):
+        time.sleep(1)
+    p.kill()
+
+
+def save_gpu_data(mem_list):
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == 'imdb_%s_gpu_memory' % (args.batch_size):
+            gpu_memory_kpi = kpi
+    gpu_memory_kpi.add_record(max(mem_list))
+    gpu_memory_kpi.persist()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    global is_alive
+    is_alive = True
+    collect_memory_thread = threading.Thread(
+        target=collect_gpu_memory_data, args=(is_alive, ))
+    collect_memory_thread.setDaemon(True)
+    collect_memory_thread.start()
+    main()
+    is_alive = False
diff --git a/lstm/run.xsh b/lstm/run.xsh
new file mode 100755
index 00000000..d184f534
--- /dev/null
+++ b/lstm/run.xsh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${lstm_cudaid:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+#imdb 32
+FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=32 --iterations=50 --gpu_id=$cudaid
+python get_gpu_data.py --batch_size=32 --data_set=imdb
+for pid in $(ps -ef | grep nvidia-smi |
grep -v grep | cut -c 9-15); do + echo $pid + kill -9 $pid +done diff --git a/mnist/continuous_evaluation.py b/mnist/continuous_evaluation.py index 0698887b..25047026 100644 --- a/mnist/continuous_evaluation.py +++ b/mnist/continuous_evaluation.py @@ -3,12 +3,14 @@ sys.path.append(os.environ['ceroot']) from kpi import CostKpi, DurationKpi, AccKpi -train_acc_kpi = AccKpi('train_acc', 0.05) -test_acc_kpi = AccKpi('test_acc', 0.05) -train_duration_kpi = DurationKpi('train_duration', 0.1) +train_cost_kpi = CostKpi('train_cost', 0.02, actived=True) +test_acc_kpi = AccKpi('test_acc', 0.005, actived=True) +train_duration_kpi = DurationKpi('train_duration', 0.02, actived=True) +train_acc_kpi = AccKpi('train_acc', 0.005, actived=True) tracking_kpis = [ train_acc_kpi, + train_cost_kpi, test_acc_kpi, train_duration_kpi, ] diff --git a/mnist/latest_kpis/test_acc_factor.txt b/mnist/latest_kpis/test_acc_factor.txt index f1fd6659..bdfcae70 100644 --- a/mnist/latest_kpis/test_acc_factor.txt +++ b/mnist/latest_kpis/test_acc_factor.txt @@ -1,5 +1,5 @@ -[0.9749000072479248] -[0.9811000227928162] -[0.9858999848365784] -[0.9860000014305115] -[0.9872000217437744] \ No newline at end of file +[0.9768999814987183] +[0.9839000105857849] +[0.9868000149726868] +[0.9866999983787537] +[0.9879000186920166] diff --git a/mnist/latest_kpis/train_acc_factor.txt b/mnist/latest_kpis/train_acc_factor.txt index 55a945e1..856ba0ff 100644 --- a/mnist/latest_kpis/train_acc_factor.txt +++ b/mnist/latest_kpis/train_acc_factor.txt @@ -1,5 +1,5 @@ -[0.9435666799545288] -[0.982283353805542] -[0.9876833558082581] -[0.9906833171844482] -[0.9932000041007996] \ No newline at end of file +[0.9471499919891357] +[0.9831333160400391] +[0.9886166453361511] +[0.9915000200271606] +[0.9929666519165039] diff --git a/mnist/latest_kpis/train_cost_factor.txt b/mnist/latest_kpis/train_cost_factor.txt new file mode 100644 index 00000000..7d7bd861 --- /dev/null +++ b/mnist/latest_kpis/train_cost_factor.txt @@ -0,0 +1,5 @@ +[0.05625442788004875] +[0.0373283299320031] +[0.0393865630030632] +[0.029800457879900932] +[0.02382788062095642] diff --git a/mnist/latest_kpis/train_duration_factor.txt b/mnist/latest_kpis/train_duration_factor.txt index 8253e948..ee89488a 100644 --- a/mnist/latest_kpis/train_duration_factor.txt +++ b/mnist/latest_kpis/train_duration_factor.txt @@ -1,5 +1,5 @@ -[38.24392104148865] -[36.998713970184326] -[36.87090182304382] -[36.75976610183716] -[36.79504203796387] \ No newline at end of file +[36.52754783630371] +[36.04332995414734] +[36.20732808113098] +[36.188393115997314] +[35.95417380332947] diff --git a/mnist/model.py b/mnist/model.py index f180f3b5..1ae83922 100644 --- a/mnist/model.py +++ b/mnist/model.py @@ -10,13 +10,13 @@ import paddle.fluid as fluid import paddle.fluid.profiler as profiler -from continuous_evaluation import (train_acc_kpi, test_acc_kpi, +from continuous_evaluation import (train_acc_kpi, train_cost_kpi, test_acc_kpi, train_duration_kpi, tracking_kpis) -SEED = 1 +SEED = 90 DTYPE = "float32" # random seed must set before configuring the network. 
-# fluid.default_startup_program().random_seed = SEED +fluid.default_startup_program().random_seed = SEED def parse_args(): @@ -49,8 +49,8 @@ def parse_args(): def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] - and vars(args)['device'] == 'GPU') + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') print('----------- Configuration Arguments -----------') for arg, value in sorted(vars(args).iteritems()): print('%s: %s' % (arg, value)) @@ -99,13 +99,10 @@ def eval_test(exe, batch_acc, batch_size_tensor, inference_program): y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([len(y_data), 1]) - acc, weight = exe.run( - inference_program, - feed={ - "pixel": img_data, - "label": y_data - }, - fetch_list=[batch_acc, batch_size_tensor]) + acc, weight = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) test_pass_acc.add(value=acc, weight=weight) pass_acc = test_pass_acc.eval() return pass_acc @@ -158,6 +155,7 @@ def run_benchmark(model, args): for pass_id in range(args.pass_num): accuracy.reset() pass_start = time.time() + every_pass_loss = [] for batch_id, data in enumerate(train_reader()): img_data = np.array( map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) @@ -165,29 +163,30 @@ def run_benchmark(model, args): y_data = y_data.reshape([len(y_data), 1]) start = time.time() - outs = exe.run( + loss, acc, weight = exe.run( fluid.default_main_program(), - feed={ - "pixel": img_data, - "label": y_data - }, + feed={"pixel": img_data, + "label": y_data}, fetch_list=[avg_cost, batch_acc, batch_size_tensor] ) # The accuracy is the accumulation of batches, but not the current batch. 
-            accuracy.add(value=outs[1], weight=outs[2])
             end = time.time()
-            loss = np.array(outs[0])
-            acc = np.array(outs[1])
+            accuracy.add(value=acc, weight=weight)
+            every_pass_loss.append(loss)
+            print ("Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                   (pass_id, batch_id, loss, acc))
 
         pass_end = time.time()
 
         train_avg_acc = accuracy.eval()
+        train_avg_loss = np.mean(every_pass_loss)
         test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                  inference_program)
 
-        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
-              (pass_id, train_avg_acc, test_avg_acc, (pass_end - pass_start)))
+        print("pass=%d, train_avg_acc=%f,train_avg_loss=%f, test_avg_acc=%f, elapse=%f" %
+              (pass_id, train_avg_acc, train_avg_loss, test_avg_acc, (pass_end - pass_start)))
 
         train_acc_kpi.add_record(np.array(train_avg_acc, dtype='float32'))
+        train_cost_kpi.add_record(np.array(train_avg_loss, dtype='float32'))
         test_acc_kpi.add_record(np.array(test_avg_acc, dtype='float32'))
         train_duration_kpi.add_record(pass_end - pass_start)
diff --git a/mnist/run.xsh b/mnist/run.xsh
index 65f75488..504d3c63 100755
--- a/mnist/run.xsh
+++ b/mnist/run.xsh
@@ -2,5 +2,4 @@
 import sys
 
 model_file = 'model.py'
-
 python @(model_file) --batch_size 128 --pass_num 5 --device CPU
diff --git a/object_detection/continuous_evaluation.py b/object_detection/continuous_evaluation.py
new file mode 100644
index 00000000..465a7985
--- /dev/null
+++ b/object_detection/continuous_evaluation.py
@@ -0,0 +1,10 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+train_speed_kpi = AccKpi('train_speed', 0.02, 0, actived=True)
+four_card_speed_kpi = AccKpi('four_card_train_speed', 0.02, 0, actived=True)
+
+tracking_kpis = [train_cost_kpi, train_speed_kpi, four_card_speed_kpi]
diff --git a/object_detection/download.sh b/object_detection/download.sh
new file mode 100755
index 00000000..fe483255
--- /dev/null
+++ b/object_detection/download.sh
@@ -0,0 +1,19 @@
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+# Download the data.
+cp labels/* data/pascalvoc/
+cd data/pascalvoc
+
+echo "Downloading..."
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+# Extract the data.
+echo "Extracting..."
+tar -xf VOCtrainval_11-May-2012.tar
+tar -xf VOCtrainval_06-Nov-2007.tar
+tar -xf VOCtest_06-Nov-2007.tar
+
+echo "Creating data lists..."
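+# A small guard (sketch): create_list.py below expects ./VOCdevkit from the
+# tarballs extracted above, so fail early if the downloads did not complete.
+[ -d VOCdevkit ] || { echo "VOCdevkit not found; re-run the downloads" >&2; exit 1; }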
+python create_list.py diff --git a/object_detection/image_util.py b/object_detection/image_util.py new file mode 100644 index 00000000..4ce53048 --- /dev/null +++ b/object_detection/image_util.py @@ -0,0 +1,234 @@ +from PIL import Image, ImageEnhance +import numpy as np +import random +import math + + +class sampler(): + def __init__(self, max_sample, max_trial, min_scale, max_scale, + min_aspect_ratio, max_aspect_ratio, min_jaccard_overlap, + max_jaccard_overlap): + self.max_sample = max_sample + self.max_trial = max_trial + self.min_scale = min_scale + self.max_scale = max_scale + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + self.min_jaccard_overlap = min_jaccard_overlap + self.max_jaccard_overlap = max_jaccard_overlap + + +class bbox(): + def __init__(self, xmin, ymin, xmax, ymax): + self.xmin = xmin + self.ymin = ymin + self.xmax = xmax + self.ymax = ymax + + +def bbox_area(src_bbox): + width = src_bbox.xmax - src_bbox.xmin + height = src_bbox.ymax - src_bbox.ymin + return width * height + + +def generate_sample(sampler): + scale = random.uniform(sampler.min_scale, sampler.max_scale) + min_aspect_ratio = max(sampler.min_aspect_ratio, (scale**2.0)) + max_aspect_ratio = min(sampler.max_aspect_ratio, 1 / (scale**2.0)) + aspect_ratio = random.uniform(min_aspect_ratio, max_aspect_ratio) + bbox_width = scale * (aspect_ratio**0.5) + bbox_height = scale / (aspect_ratio**0.5) + xmin_bound = 1 - bbox_width + ymin_bound = 1 - bbox_height + xmin = random.uniform(0, xmin_bound) + ymin = random.uniform(0, ymin_bound) + xmax = xmin + bbox_width + ymax = ymin + bbox_height + sampled_bbox = bbox(xmin, ymin, xmax, ymax) + return sampled_bbox + + +def jaccard_overlap(sample_bbox, object_bbox): + if sample_bbox.xmin >= object_bbox.xmax or \ + sample_bbox.xmax <= object_bbox.xmin or \ + sample_bbox.ymin >= object_bbox.ymax or \ + sample_bbox.ymax <= object_bbox.ymin: + return 0 + intersect_xmin = max(sample_bbox.xmin, object_bbox.xmin) + intersect_ymin = max(sample_bbox.ymin, object_bbox.ymin) + intersect_xmax = min(sample_bbox.xmax, object_bbox.xmax) + intersect_ymax = min(sample_bbox.ymax, object_bbox.ymax) + intersect_size = (intersect_xmax - intersect_xmin) * ( + intersect_ymax - intersect_ymin) + sample_bbox_size = bbox_area(sample_bbox) + object_bbox_size = bbox_area(object_bbox) + overlap = intersect_size / ( + sample_bbox_size + object_bbox_size - intersect_size) + return overlap + + +def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): + if sampler.min_jaccard_overlap == 0 and sampler.max_jaccard_overlap == 0: + return True + for i in range(len(bbox_labels)): + object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2], + bbox_labels[i][3], bbox_labels[i][4]) + overlap = jaccard_overlap(sample_bbox, object_bbox) + if sampler.min_jaccard_overlap != 0 and \ + overlap < sampler.min_jaccard_overlap: + continue + if sampler.max_jaccard_overlap != 0 and \ + overlap > sampler.max_jaccard_overlap: + continue + return True + return False + + +def generate_batch_samples(batch_sampler, bbox_labels): + sampled_bbox = [] + index = [] + c = 0 + for sampler in batch_sampler: + found = 0 + for i in range(sampler.max_trial): + if found >= sampler.max_sample: + break + sample_bbox = generate_sample(sampler) + if satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): + sampled_bbox.append(sample_bbox) + found = found + 1 + index.append(c) + c = c + 1 + return sampled_bbox + + +def clip_bbox(src_bbox): + src_bbox.xmin = max(min(src_bbox.xmin, 1.0), 0.0) + 
src_bbox.ymin = max(min(src_bbox.ymin, 1.0), 0.0) + src_bbox.xmax = max(min(src_bbox.xmax, 1.0), 0.0) + src_bbox.ymax = max(min(src_bbox.ymax, 1.0), 0.0) + return src_bbox + + +def meet_emit_constraint(src_bbox, sample_bbox): + center_x = (src_bbox.xmax + src_bbox.xmin) / 2 + center_y = (src_bbox.ymax + src_bbox.ymin) / 2 + if center_x >= sample_bbox.xmin and \ + center_x <= sample_bbox.xmax and \ + center_y >= sample_bbox.ymin and \ + center_y <= sample_bbox.ymax: + return True + return False + + +def transform_labels(bbox_labels, sample_bbox): + proj_bbox = bbox(0, 0, 0, 0) + sample_labels = [] + for i in range(len(bbox_labels)): + sample_label = [] + object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2], + bbox_labels[i][3], bbox_labels[i][4]) + if not meet_emit_constraint(object_bbox, sample_bbox): + continue + sample_width = sample_bbox.xmax - sample_bbox.xmin + sample_height = sample_bbox.ymax - sample_bbox.ymin + proj_bbox.xmin = (object_bbox.xmin - sample_bbox.xmin) / sample_width + proj_bbox.ymin = (object_bbox.ymin - sample_bbox.ymin) / sample_height + proj_bbox.xmax = (object_bbox.xmax - sample_bbox.xmin) / sample_width + proj_bbox.ymax = (object_bbox.ymax - sample_bbox.ymin) / sample_height + proj_bbox = clip_bbox(proj_bbox) + if bbox_area(proj_bbox) > 0: + sample_label.append(bbox_labels[i][0]) + sample_label.append(float(proj_bbox.xmin)) + sample_label.append(float(proj_bbox.ymin)) + sample_label.append(float(proj_bbox.xmax)) + sample_label.append(float(proj_bbox.ymax)) + sample_label.append(bbox_labels[i][5]) + sample_labels.append(sample_label) + return sample_labels + + +def crop_image(img, bbox_labels, sample_bbox, image_width, image_height): + sample_bbox = clip_bbox(sample_bbox) + xmin = int(sample_bbox.xmin * image_width) + xmax = int(sample_bbox.xmax * image_width) + ymin = int(sample_bbox.ymin * image_height) + ymax = int(sample_bbox.ymax * image_height) + sample_img = img[ymin:ymax, xmin:xmax] + sample_labels = transform_labels(bbox_labels, sample_bbox) + return sample_img, sample_labels + + +def random_brightness(img, settings): + prob = random.uniform(0, 1) + if prob < settings._brightness_prob: + delta = random.uniform(-settings._brightness_delta, + settings._brightness_delta) + 1 + img = ImageEnhance.Brightness(img).enhance(delta) + return img + + +def random_contrast(img, settings): + prob = random.uniform(0, 1) + if prob < settings._contrast_prob: + delta = random.uniform(-settings._contrast_delta, + settings._contrast_delta) + 1 + img = ImageEnhance.Contrast(img).enhance(delta) + return img + + +def random_saturation(img, settings): + prob = random.uniform(0, 1) + if prob < settings._saturation_prob: + delta = random.uniform(-settings._saturation_delta, + settings._saturation_delta) + 1 + img = ImageEnhance.Color(img).enhance(delta) + return img + + +def random_hue(img, settings): + prob = random.uniform(0, 1) + if prob < settings._hue_prob: + delta = random.uniform(-settings._hue_delta, settings._hue_delta) + img_hsv = np.array(img.convert('HSV')) + img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta + img = Image.fromarray(img_hsv, mode='HSV').convert('RGB') + return img + + +def distort_image(img, settings): + prob = random.uniform(0, 1) + # Apply different distort order + if prob > 0.5: + img = random_brightness(img, settings) + img = random_contrast(img, settings) + img = random_saturation(img, settings) + img = random_hue(img, settings) + else: + img = random_brightness(img, settings) + img = random_saturation(img, settings) + img = random_hue(img, settings) 
+ img = random_contrast(img, settings) + return img + + +def expand_image(img, bbox_labels, img_width, img_height, settings): + prob = random.uniform(0, 1) + if prob < settings._expand_prob: + if settings._expand_max_ratio - 1 >= 0.01: + expand_ratio = random.uniform(1, settings._expand_max_ratio) + height = int(img_height * expand_ratio) + width = int(img_width * expand_ratio) + h_off = math.floor(random.uniform(0, height - img_height)) + w_off = math.floor(random.uniform(0, width - img_width)) + expand_bbox = bbox(-w_off / img_width, -h_off / img_height, + (width - w_off) / img_width, + (height - h_off) / img_height) + expand_img = np.ones((height, width, 3)) + expand_img = np.uint8(expand_img * np.squeeze(settings._img_mean)) + expand_img = Image.fromarray(expand_img) + expand_img.paste(img, (int(w_off), int(h_off))) + bbox_labels = transform_labels(bbox_labels, expand_bbox) + return expand_img, bbox_labels, width, height + return img, bbox_labels, img_width, img_height diff --git a/object_detection/labels/create_list.py b/object_detection/labels/create_list.py new file mode 100644 index 00000000..8b472aac --- /dev/null +++ b/object_detection/labels/create_list.py @@ -0,0 +1,66 @@ +import os +import os.path as osp +import re +import random + +devkit_dir = './VOCdevkit' +years = ['2007', '2012'] + + +def get_dir(devkit_dir, year, type): + return osp.join(devkit_dir, 'VOC' + year, type) + + +def walk_dir(devkit_dir, year): + filelist_dir = get_dir(devkit_dir, year, 'ImageSets/Main') + annotation_dir = get_dir(devkit_dir, year, 'Annotations') + img_dir = get_dir(devkit_dir, year, 'JPEGImages') + trainval_list = [] + test_list = [] + added = set() + + for _, _, files in os.walk(filelist_dir): + for fname in files: + img_ann_list = [] + if re.match('[a-z]+_trainval\.txt', fname): + img_ann_list = trainval_list + elif re.match('[a-z]+_test\.txt', fname): + img_ann_list = test_list + else: + continue + fpath = osp.join(filelist_dir, fname) + for line in open(fpath): + name_prefix = line.strip().split()[0] + if name_prefix in added: + continue + added.add(name_prefix) + ann_path = osp.join(annotation_dir, name_prefix + '.xml') + img_path = osp.join(img_dir, name_prefix + '.jpg') + assert os.path.isfile( + ann_path), 'file %s not found.' % ann_path + assert os.path.isfile( + img_path), 'file %s not found.' 
% img_path + img_ann_list.append((img_path, ann_path)) + + return trainval_list, test_list + + +def prepare_filelist(devkit_dir, years, output_dir): + trainval_list = [] + test_list = [] + for year in years: + trainval, test = walk_dir(devkit_dir, year) + trainval_list.extend(trainval) + test_list.extend(test) + random.shuffle(trainval_list) + with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval: + for item in trainval_list: + ftrainval.write(item[0] + ' ' + item[1] + '\n') + + with open(osp.join(output_dir, 'test.txt'), 'w') as ftest: + for item in test_list: + ftest.write(item[0] + ' ' + item[1] + '\n') + + +if __name__ == '__main__': + prepare_filelist(devkit_dir, years, '.') diff --git a/object_detection/labels/label_list b/object_detection/labels/label_list new file mode 100644 index 00000000..87df23ce --- /dev/null +++ b/object_detection/labels/label_list @@ -0,0 +1,21 @@ +background +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor diff --git a/object_detection/latest_kpis/four_card_train_speed_factor.txt b/object_detection/latest_kpis/four_card_train_speed_factor.txt new file mode 100644 index 00000000..8ba14137 --- /dev/null +++ b/object_detection/latest_kpis/four_card_train_speed_factor.txt @@ -0,0 +1 @@ +[143.63855412820158] \ No newline at end of file diff --git a/object_detection/latest_kpis/train_cost_factor.txt b/object_detection/latest_kpis/train_cost_factor.txt new file mode 100644 index 00000000..5b830a56 --- /dev/null +++ b/object_detection/latest_kpis/train_cost_factor.txt @@ -0,0 +1 @@ +[8.613137321472168] diff --git a/object_detection/latest_kpis/train_speed_factor.txt b/object_detection/latest_kpis/train_speed_factor.txt new file mode 100644 index 00000000..11510145 --- /dev/null +++ b/object_detection/latest_kpis/train_speed_factor.txt @@ -0,0 +1 @@ +[71.42477785941152] diff --git a/object_detection/mobilenet_ssd.py b/object_detection/mobilenet_ssd.py new file mode 100644 index 00000000..06c71d55 --- /dev/null +++ b/object_detection/mobilenet_ssd.py @@ -0,0 +1,116 @@ +import paddle as paddle +import paddle.fluid as fluid +from paddle.fluid.initializer import MSRA +from paddle.fluid.param_attr import ParamAttr + + +def conv_bn(input, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='relu', + use_cudnn=True): + parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA()) + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + act=None, + use_cudnn=use_cudnn, + param_attr=parameter_attr, + bias_attr=False) + parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA()) + bias_attr = ParamAttr(learning_rate=0.2) + return fluid.layers.batch_norm(input=conv, act=act) + + +def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride, + scale): + depthwise_conv = conv_bn( + input=input, + filter_size=3, + num_filters=int(num_filters1 * scale), + stride=stride, + padding=1, + num_groups=int(num_groups * scale), + use_cudnn=False) + + pointwise_conv = conv_bn( + input=depthwise_conv, + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + return pointwise_conv + + +def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale): + # 1x1 conv + pointwise_conv = conv_bn( + input=input, + filter_size=1, + num_filters=int(num_filters1 * scale), + stride=1, 
+ num_groups=int(num_groups * scale), + padding=0) + + # 3x3 conv + normal_conv = conv_bn( + input=pointwise_conv, + filter_size=3, + num_filters=int(num_filters2 * scale), + stride=2, + num_groups=int(num_groups * scale), + padding=1) + return normal_conv + + +def mobile_net(num_classes, img, img_shape, scale=1.0): + # 300x300 + tmp = conv_bn(img, 3, int(32 * scale), 2, 1, 3) + # 150x150 + tmp = depthwise_separable(tmp, 32, 64, 32, 1, scale) + tmp = depthwise_separable(tmp, 64, 128, 64, 2, scale) + # 75x75 + tmp = depthwise_separable(tmp, 128, 128, 128, 1, scale) + tmp = depthwise_separable(tmp, 128, 256, 128, 2, scale) + # 38x38 + tmp = depthwise_separable(tmp, 256, 256, 256, 1, scale) + tmp = depthwise_separable(tmp, 256, 512, 256, 2, scale) + + # 19x19 + for i in range(5): + tmp = depthwise_separable(tmp, 512, 512, 512, 1, scale) + module11 = tmp + tmp = depthwise_separable(tmp, 512, 1024, 512, 2, scale) + + # 10x10 + module13 = depthwise_separable(tmp, 1024, 1024, 1024, 1, scale) + module14 = extra_block(module13, 256, 512, 1, 2, scale) + # 5x5 + module15 = extra_block(module14, 128, 256, 1, 2, scale) + # 3x3 + module16 = extra_block(module15, 128, 256, 1, 2, scale) + # 2x2 + module17 = extra_block(module16, 64, 128, 1, 2, scale) + + mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head( + inputs=[module11, module13, module14, module15, module16, module17], + image=img, + num_classes=num_classes, + min_ratio=20, + max_ratio=90, + min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0], + max_sizes=[[], 150.0, 195.0, 240.0, 285.0, 300.0], + aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]], + base_size=img_shape[2], + offset=0.5, + flip=True) + + return mbox_locs, mbox_confs, box, box_var diff --git a/object_detection/reader.py b/object_detection/reader.py new file mode 100644 index 00000000..4ccf3326 --- /dev/null +++ b/object_detection/reader.py @@ -0,0 +1,355 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
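+
+# Illustrative wiring of the Settings object defined below (a sketch only;
+# the paths mirror the pascalvoc layout prepared by download.sh):
+#
+#     settings = Settings(dataset='pascalvoc', data_dir='data/pascalvoc',
+#                         label_file='label_list', resize_h=300, resize_w=300,
+#                         mean_value=[127.5, 127.5, 127.5])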
+
+import image_util
+from paddle.utils.image_util import *
+import random
+from PIL import Image
+from PIL import ImageDraw
+import numpy as np
+import xml.etree.ElementTree
+import os
+import time
+import copy
+
+
+class Settings(object):
+    def __init__(self,
+                 dataset=None,
+                 data_dir=None,
+                 label_file=None,
+                 resize_h=300,
+                 resize_w=300,
+                 mean_value=[127.5, 127.5, 127.5],
+                 apply_distort=True,
+                 apply_expand=True,
+                 toy=0):
+        self._dataset = dataset
+        self._toy = toy
+        self._data_dir = data_dir
+        if dataset == "pascalvoc":
+            self._label_list = []
+            label_fpath = os.path.join(data_dir, label_file)
+            for line in open(label_fpath):
+                self._label_list.append(line.strip())
+
+        self._apply_distort = apply_distort
+        self._apply_expand = apply_expand
+        self._resize_height = resize_h
+        self._resize_width = resize_w
+        self._img_mean = np.array(mean_value)[:, np.newaxis,
+                                              np.newaxis].astype('float32')
+        self._expand_prob = 0.5
+        self._expand_max_ratio = 4
+        self._hue_prob = 0.5
+        self._hue_delta = 18
+        self._contrast_prob = 0.5
+        self._contrast_delta = 0.5
+        self._saturation_prob = 0.5
+        self._saturation_delta = 0.5
+        self._brightness_prob = 0.5
+        self._brightness_delta = 0.125
+
+    @property
+    def dataset(self):
+        return self._dataset
+
+    @property
+    def toy(self):
+        return self._toy
+
+    @property
+    def apply_expand(self):
+        return self._apply_expand
+
+    @property
+    def apply_distort(self):
+        return self._apply_distort
+
+    @property
+    def data_dir(self):
+        return self._data_dir
+
+    @data_dir.setter
+    def data_dir(self, data_dir):
+        self._data_dir = data_dir
+
+    @property
+    def label_list(self):
+        return self._label_list
+
+    @property
+    def resize_h(self):
+        return self._resize_height
+
+    @property
+    def resize_w(self):
+        return self._resize_width
+
+    @property
+    def img_mean(self):
+        return self._img_mean
+
+
+def preprocess(img, bbox_labels, mode, settings):
+    img_width, img_height = img.size
+    sampled_labels = bbox_labels
+    if mode == 'train':
+        if settings._apply_distort:
+            img = image_util.distort_image(img, settings)
+        if settings._apply_expand:
+            img, bbox_labels, img_width, img_height = image_util.expand_image(
+                img, bbox_labels, img_width, img_height, settings)
+        # sampling
+        batch_sampler = []
+        # hard-code here
+        batch_sampler.append(
+            image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0))
+        batch_sampler.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0))
+        sampled_bbox = image_util.generate_batch_samples(batch_sampler,
+                                                         bbox_labels)
+
+        img = np.array(img)
+        if len(sampled_bbox) > 0:
+            idx = int(random.uniform(0, len(sampled_bbox)))
+            img, sampled_labels = image_util.crop_image(
+                img, bbox_labels, sampled_bbox[idx], img_width, img_height)
+
+        img = Image.fromarray(img)
+    img = img.resize((settings.resize_w, settings.resize_h), Image.ANTIALIAS)
+    img = np.array(img)
+
+    if mode == 'train':
+        mirror = int(random.uniform(0, 2))
+        if mirror == 1:
+            img = img[:, ::-1, :]
+            for i in xrange(len(sampled_labels)):
+                tmp = sampled_labels[i][1]
+                sampled_labels[i][1] = 1 - sampled_labels[i][3]
+                sampled_labels[i][3] = 1 - tmp
+    # HWC to CHW
+    if len(img.shape) == 
3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + # RBG to BGR + img = img[[2, 1, 0], :, :] + img = img.astype('float32') + img -= settings.img_mean + img = img * 0.007843 + return img, sampled_labels + + +def coco(settings, file_list, mode, shuffle): + # cocoapi + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + coco = COCO(file_list) + image_ids = coco.getImgIds() + images = coco.loadImgs(image_ids) + category_ids = coco.getCatIds() + category_names = [item['name'] for item in coco.loadCats(category_ids)] + + if not settings.toy == 0: + images = images[:settings.toy] if len( + images) > settings.toy else images + print("{} on {} with {} images".format(mode, settings.dataset, len( + images))) + + def reader(): + if mode == 'train' and shuffle: + random.shuffle(images) + for image in images: + image_name = image['file_name'] + image_path = os.path.join(settings.data_dir, image_name) + + im = Image.open(image_path) + if im.mode == 'L': + im = im.convert('RGB') + im_width, im_height = im.size + + # layout: category_id | xmin | ymin | xmax | ymax | iscrowd | + # origin_coco_bbox | segmentation | area | image_id | annotation_id + bbox_labels = [] + annIds = coco.getAnnIds(imgIds=image['id']) + anns = coco.loadAnns(annIds) + for ann in anns: + bbox_sample = [] + # start from 1, leave 0 to background + bbox_sample.append( + float(category_ids.index(ann['category_id'])) + 1) + bbox = ann['bbox'] + xmin, ymin, w, h = bbox + xmax = xmin + w + ymax = ymin + h + bbox_sample.append(float(xmin) / im_width) + bbox_sample.append(float(ymin) / im_height) + bbox_sample.append(float(xmax) / im_width) + bbox_sample.append(float(ymax) / im_height) + bbox_sample.append(float(ann['iscrowd'])) + bbox_labels.append(bbox_sample) + im, sample_labels = preprocess(im, bbox_labels, mode, settings) + sample_labels = np.array(sample_labels) + if len(sample_labels) == 0: continue + im = im.astype('float32') + boxes = sample_labels[:, 1:5] + lbls = sample_labels[:, 0].astype('int32') + difficults = sample_labels[:, -1].astype('int32') + yield im, boxes, lbls, difficults + + return reader + + +def pascalvoc(settings, file_list, mode, shuffle): + flist = open(file_list) + images = [line.strip() for line in flist] + if not settings.toy == 0: + images = images[:settings.toy] if len( + images) > settings.toy else images + print("{} on {} with {} images".format(mode, settings.dataset, len( + images))) + + def reader(): + if mode == 'train' and shuffle: + random.shuffle(images) + for image in images: + image_path, label_path = image.split() + image_path = os.path.join(settings.data_dir, image_path) + label_path = os.path.join(settings.data_dir, label_path) + + im = Image.open(image_path) + if im.mode == 'L': + im = im.convert('RGB') + im_width, im_height = im.size + + # layout: label | xmin | ymin | xmax | ymax | difficult + bbox_labels = [] + root = xml.etree.ElementTree.parse(label_path).getroot() + for object in root.findall('object'): + bbox_sample = [] + # start from 1 + bbox_sample.append( + float( + settings.label_list.index(object.find('name').text))) + bbox = object.find('bndbox') + difficult = float(object.find('difficult').text) + bbox_sample.append(float(bbox.find('xmin').text) / im_width) + bbox_sample.append(float(bbox.find('ymin').text) / im_height) + bbox_sample.append(float(bbox.find('xmax').text) / im_width) + bbox_sample.append(float(bbox.find('ymax').text) / im_height) + bbox_sample.append(difficult) + bbox_labels.append(bbox_sample) + im, sample_labels = 
preprocess(im, bbox_labels, mode, settings)
+            sample_labels = np.array(sample_labels)
+            if len(sample_labels) == 0: continue
+            im = im.astype('float32')
+            boxes = sample_labels[:, 1:5]
+            lbls = sample_labels[:, 0].astype('int32')
+            difficults = sample_labels[:, -1].astype('int32')
+            yield im, boxes, lbls, difficults
+
+    return reader
+
+
+def draw_bounding_box_on_image(image,
+                               sample_labels,
+                               image_name,
+                               category_names,
+                               color='red',
+                               thickness=4,
+                               with_text=True,
+                               normalized=True):
+    image = Image.fromarray(image)
+    draw = ImageDraw.Draw(image)
+    im_width, im_height = image.size
+    if not normalized:
+        im_width, im_height = 1, 1
+    for item in sample_labels:
+        label = item[0]
+        category_name = category_names[int(label)]
+        bbox = item[1:5]
+        xmin, ymin, xmax, ymax = bbox
+        (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
+                                      ymin * im_height, ymax * im_height)
+        draw.line(
+            [(left, top), (left, bottom), (right, bottom), (right, top),
+             (left, top)],
+            width=thickness,
+            fill=color)
+        if with_text:
+            if image.mode == 'RGB':
+                draw.text((left, top), category_name, (255, 255, 0))
+    image.save(image_name)
+
+
+def train(settings, file_list, shuffle=True):
+    file_list = os.path.join(settings.data_dir, file_list)
+    if settings.dataset == 'coco':
+        train_settings = copy.copy(settings)
+        if '2014' in file_list:
+            sub_dir = "train2014"
+        elif '2017' in file_list:
+            sub_dir = "train2017"
+        train_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
+        return coco(train_settings, file_list, 'train', shuffle)
+    else:
+        return pascalvoc(settings, file_list, 'train', shuffle)
+
+
+def test(settings, file_list):
+    file_list = os.path.join(settings.data_dir, file_list)
+    if settings.dataset == 'coco':
+        test_settings = copy.copy(settings)
+        if '2014' in file_list:
+            sub_dir = "val2014"
+        elif '2017' in file_list:
+            sub_dir = "val2017"
+        test_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
+        return coco(test_settings, file_list, 'test', False)
+    else:
+        return pascalvoc(settings, file_list, 'test', False)
+
+
+def infer(settings, image_path):
+    def reader():
+        im = Image.open(image_path)
+        if im.mode == 'L':
+            im = im.convert('RGB')
+        im_width, im_height = im.size
+        img = im.resize((settings.resize_w, settings.resize_h),
+                        Image.ANTIALIAS)
+        img = np.array(img)
+        # HWC to CHW
+        if len(img.shape) == 3:
+            img = np.swapaxes(img, 1, 2)
+            img = np.swapaxes(img, 1, 0)
+        # RGB to BGR
+        img = img[[2, 1, 0], :, :]
+        img = img.astype('float32')
+        img -= settings.img_mean
+        img = img * 0.007843
+        yield img
+
+    return reader diff --git a/object_detection/run.xsh b/object_detection/run.xsh new file mode 100755 index 00000000..8783d96f --- /dev/null +++ b/object_detection/run.xsh @@ -0,0 +1,16 @@ +#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${object_detection_cudaid:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+#if [ ! 
-d "data/pascalvoc" ];then
+#    mkdir -p data/pascalvoc
+#    ./download.sh
+#fi
+FLAGS_benchmark=true python train.py --batch_size=64 --num_passes=2
+cudaid=${object_detection_multi_cudaid:=0,1,2,3} # use cards 0-3 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+FLAGS_benchmark=true python train.py --batch_size=64 --num_passes=2 --gpu_card_num=4
+ diff --git a/object_detection/train.py b/object_detection/train.py new file mode 100644 index 00000000..7846cbe8 --- /dev/null +++ b/object_detection/train.py @@ -0,0 +1,369 @@ +import os
+import time
+import numpy as np
+import argparse
+import functools
+import shutil
+
+import paddle
+import paddle.fluid as fluid
+import reader
+from mobilenet_ssd import mobile_net
+from utility import add_arguments, print_arguments
+
+from continuous_evaluation import train_cost_kpi, train_speed_kpi, four_card_speed_kpi
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('learning_rate', float, 0.001, "Learning rate.")
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('num_passes', int, 120, "Epoch number.")
+add_arg('iterations', int, 120, "The number of mini-batches per pass.")
+add_arg('skip_batch_num', int, 5, "The number of warm-up mini-batches to skip.")
+add_arg('gpu_card_num', int, 1, "The number of GPU cards.")
+add_arg('parallel', bool, True, "Whether use parallel training.")
+add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
+add_arg('use_nccl', bool, True, "Whether to use NCCL or not.")
+add_arg('dataset', str, 'pascalvoc', "coco or pascalvoc.")
+add_arg('model_save_dir', str, 'model', "The path to save model.")
+add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
+add_arg('apply_distort', bool, True, "Whether to apply distort.")
+add_arg('apply_expand', bool, True, "Whether to apply expand.")
+add_arg('ap_version', str, '11point', "11point or integral")
+add_arg('resize_h', int, 300, "The resized image height.")
+add_arg('resize_w', int, 300, "The resized image width.")
+add_arg('mean_value_B', float, 127.5, "mean value for B channel which will be subtracted") #123.68
+add_arg('mean_value_G', float, 127.5, "mean value for G channel which will be subtracted") #116.78
+add_arg('mean_value_R', float, 127.5, "mean value for R channel which will be subtracted") #103.94
+add_arg('is_toy', int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample")
+# yapf: enable
+
+
+def parallel_do(args,
+                train_file_list,
+                val_file_list,
+                data_args,
+                learning_rate,
+                batch_size,
+                num_passes,
+                model_save_dir,
+                pretrained_model=None):
+    image_shape = [3, data_args.resize_h, data_args.resize_w]
+    if data_args.dataset == 'coco':
+        num_classes = 81
+    elif data_args.dataset == 'pascalvoc':
+        num_classes = 21
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    gt_box = fluid.layers.data(
+        name='gt_box', shape=[4], dtype='float32', lod_level=1)
+    gt_label = fluid.layers.data(
+        name='gt_label', shape=[1], dtype='int32', lod_level=1)
+    difficult = fluid.layers.data(
+        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
+
+    if args.parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
+        with pd.do():
+            image_ = pd.read_input(image)
+            gt_box_ = pd.read_input(gt_box)
+            gt_label_ = pd.read_input(gt_label)
+            difficult_ = pd.read_input(difficult)
+            locs, confs, box, box_var = mobile_net(num_classes, image_,
+                                                   image_shape)
+            loss = 
fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, box, + box_var) + nmsed_out = fluid.layers.detection_output( + locs, confs, box, box_var, nms_threshold=0.45) + loss = fluid.layers.reduce_sum(loss) + pd.write_output(loss) + pd.write_output(nmsed_out) + + loss, nmsed_out = pd() + loss = fluid.layers.mean(loss) + else: + locs, confs, box, box_var = mobile_net(num_classes, image, image_shape) + nmsed_out = fluid.layers.detection_output( + locs, confs, box, box_var, nms_threshold=0.45) + loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, + box_var) + loss = fluid.layers.reduce_sum(loss) + + test_program = fluid.default_main_program().clone(for_test=True) + with fluid.program_guard(test_program): + map_eval = fluid.evaluator.DetectionMAP( + nmsed_out, + gt_label, + gt_box, + difficult, + num_classes, + overlap_threshold=0.5, + evaluate_difficult=False, + ap_version=args.ap_version) + + if data_args.dataset == 'coco': + # learning rate decay in 12, 19 pass, respectively + if '2014' in train_file_list: + boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19] + elif '2017' in train_file_list: + boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19] + elif data_args.dataset == 'pascalvoc': + boundaries = [40000, 60000] + values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25] + optimizer = fluid.optimizer.RMSProp( + learning_rate=fluid.layers.piecewise_decay(boundaries, values), + regularization=fluid.regularizer.L2Decay(0.00005), ) + + optimizer.minimize(loss) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if pretrained_model: + + def if_exist(var): + return os.path.exists(os.path.join(pretrained_model, var.name)) + + fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) + + train_reader = paddle.batch( + reader.train(data_args, train_file_list), batch_size=batch_size) + test_reader = paddle.batch( + reader.test(data_args, val_file_list), batch_size=batch_size) + feeder = fluid.DataFeeder( + place=place, feed_list=[image, gt_box, gt_label, difficult]) + + def test(pass_id): + _, accum_map = map_eval.get_map_var() + map_eval.reset(exe) + test_map = None + for data in test_reader(): + test_map = exe.run(test_program, + feed=feeder.feed(data), + fetch_list=[accum_map]) + print("Test {0}, map {1}".format(pass_id, test_map[0])) + + for pass_id in range(num_passes): + start_time = time.time() + prev_start_time = start_time + end_time = 0 + for batch_id, data in enumerate(train_reader()): + prev_start_time = start_time + start_time = time.time() + loss_v = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[loss]) + end_time = time.time() + if batch_id % 20 == 0: + print("Pass {0}, batch {1}, loss {2}, time {3}".format( + pass_id, batch_id, loss_v[0], + start_time - prev_start_time)) + test(pass_id) + + if pass_id % 10 == 0 or pass_id == num_passes - 1: + model_path = os.path.join(model_save_dir, str(pass_id)) + print 'save models to %s' % (model_path) + fluid.io.save_persistables(exe, model_path) + + +def parallel_exe(args, + train_file_list, + val_file_list, + data_args, + learning_rate, + batch_size, + num_passes, + model_save_dir='model', + pretrained_model=None): + image_shape = [3, data_args.resize_h, data_args.resize_w] + if data_args.dataset == 'coco': + num_classes = 81 + elif data_args.dataset == 'pascalvoc': + num_classes = 21 + + devices = os.getenv("CUDA_VISIBLE_DEVICES") or "" + devices_num = 
len(devices.split(","))
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    gt_box = fluid.layers.data(
+        name='gt_box', shape=[4], dtype='float32', lod_level=1)
+    gt_label = fluid.layers.data(
+        name='gt_label', shape=[1], dtype='int32', lod_level=1)
+    difficult = fluid.layers.data(
+        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
+
+    locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
+    nmsed_out = fluid.layers.detection_output(
+        locs, confs, box, box_var, nms_threshold=0.45)
+    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, box_var)
+    loss = fluid.layers.reduce_sum(loss)
+
+    test_program = fluid.default_main_program().clone(for_test=True)
+    with fluid.program_guard(test_program):
+        map_eval = fluid.evaluator.DetectionMAP(
+            nmsed_out,
+            gt_label,
+            gt_box,
+            difficult,
+            num_classes,
+            overlap_threshold=0.5,
+            evaluate_difficult=False,
+            ap_version=args.ap_version)
+
+    if data_args.dataset == 'coco':
+        # the learning rate decays at pass 12 and pass 19
+        if '2014' in train_file_list:
+            epocs = 82783 / batch_size
+            boundaries = [epocs * 12, epocs * 19]
+        elif '2017' in train_file_list:
+            epocs = 118287 / batch_size
+            boundaries = [epocs * 12, epocs * 19]
+    elif data_args.dataset == 'pascalvoc':
+        epocs = 19200 / batch_size
+        boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
+    values = [
+        learning_rate, learning_rate * 0.5, learning_rate * 0.25,
+        learning_rate * 0.1, learning_rate * 0.01
+    ]
+    optimizer = fluid.optimizer.RMSProp(
+        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
+        regularization=fluid.regularizer.L2Decay(0.00005), )
+
+    optimizer.minimize(loss)
+
+    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    fluid.default_startup_program().random_seed = 1000
+    exe.run(fluid.default_startup_program())
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    if args.parallel:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=args.use_gpu, loss_name=loss.name)
+
+    train_reader = paddle.batch(
+        reader.train(data_args, train_file_list), batch_size=batch_size)
+    test_reader = paddle.batch(
+        reader.test(data_args, val_file_list), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        place=place, feed_list=[image, gt_box, gt_label, difficult])
+
+    def save_model(postfix):
+        model_path = os.path.join(model_save_dir, postfix)
+        if os.path.isdir(model_path):
+            shutil.rmtree(model_path)
+        print 'save models to %s' % (model_path)
+        fluid.io.save_persistables(exe, model_path)
+
+    best_map = 0. 
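+    # best_map tracks the highest test-set mAP seen so far; test() below
+    # saves a "best_model" checkpoint whenever it improves.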
+ + def test(pass_id, best_map): + _, accum_map = map_eval.get_map_var() + map_eval.reset(exe) + test_map = None + for data in test_reader(): + test_map = exe.run(test_program, + feed=feeder.feed(data), + fetch_list=[accum_map]) + if test_map[0] > best_map: + best_map = test_map[0] + save_model('best_model') + print("Test {0}, map {1}".format(pass_id, test_map[0])) + + train_num = 0 + total_train_time = 0.0 + total_iters = 0 + for pass_id in range(num_passes): + every_pass_loss = [] + iter = 0 + pass_duration = 0.0 + for batch_id, data in enumerate(train_reader()): + batch_start = time.time() + if iter == args.iterations: + break + if len(data) < devices_num: continue + if args.parallel: + loss_v, = train_exe.run(fetch_list=[loss.name], + feed=feeder.feed(data)) + else: + loss_v, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[loss]) + loss_v = np.mean(np.array(loss_v)) + if batch_id % 20 == 0: + print("Pass {0}, batch {1}, loss {2}, time {3}".format( + pass_id, batch_id, loss_v, time.time() - batch_start)) + if iter >= args.skip_batch_num or pass_id != 0: + batch_duration = time.time() - batch_start + pass_duration += batch_duration + train_num += len(data) + every_pass_loss.append(loss_v) + iter += 1 + total_iters += 1 + #test(pass_id, best_map) + total_train_time += pass_duration + print("Pass:%d, Loss:%f, Handle Images Duration: %f\n" % + (pass_id, np.mean(every_pass_loss), pass_duration)) + if pass_id == num_passes - 1: + examples_per_sec = train_num / total_train_time + train_cost_kpi.add_record(np.mean(every_pass_loss)) + train_speed_kpi.add_record( + np.array( + examples_per_sec, dtype='float')) + four_card_speed_kpi.add_record( + np.array( + examples_per_sec, dtype='float')) + if args.gpu_card_num == 1: + train_cost_kpi.persist() + train_speed_kpi.persist() + else: + four_card_speed_kpi.persist() + print("Best test map {0}".format(best_map)) + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + + data_dir = '/data/pascalvoc' + train_file_list = 'trainval.txt' + val_file_list = 'test.txt' + label_file = 'label_list' + model_save_dir = args.model_save_dir + if args.dataset == 'coco': + data_dir = './data/COCO17' + train_file_list = 'annotations/instances_train2017.json' + val_file_list = 'annotations/instances_val2017.json' + label_file = 'label_list' + + data_args = reader.Settings( + dataset=args.dataset, + data_dir=data_dir, + label_file=label_file, + apply_distort=args.apply_distort, + apply_expand=args.apply_expand, + resize_h=args.resize_h, + resize_w=args.resize_w, + mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R], + toy=args.is_toy) + #method = parallel_do + method = parallel_exe + method( + args, + train_file_list=train_file_list, + val_file_list=val_file_list, + data_args=data_args, + learning_rate=args.learning_rate, + batch_size=args.batch_size, + num_passes=args.num_passes, + model_save_dir=model_save_dir, + pretrained_model=args.pretrained_model) diff --git a/object_detection/utility.py b/object_detection/utility.py new file mode 100644 index 00000000..506e6007 --- /dev/null +++ b/object_detection/utility.py @@ -0,0 +1,62 @@ +"""Contains common utility functions.""" +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import distutils.util +import numpy as np +from paddle.fluid import core + + +def print_arguments(args): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + add_argument("name", str, "Jonh", "User name.", parser) + args = parser.parse_args() + """ + type = distutils.util.strtobool if type == bool else type + argparser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) diff --git a/resnet30/continuous_evaluation.py b/resnet30/continuous_evaluation.py deleted file mode 100644 index 283fcf48..00000000 --- a/resnet30/continuous_evaluation.py +++ /dev/null @@ -1,12 +0,0 @@ -import os -import sys -sys.path.append(os.environ['ceroot']) -from kpi import CostKpi, DurationKpi - -train_cost_kpi = CostKpi('train_cost', 0.01) -train_duration_kpi = DurationKpi('train_duration', 0.04) - -tracking_kpis = [ - train_cost_kpi, - train_duration_kpi, -] diff --git a/resnet30/latest_kpis/train_cost_factor.txt b/resnet30/latest_kpis/train_cost_factor.txt deleted file mode 100644 index 040e98ef..00000000 --- a/resnet30/latest_kpis/train_cost_factor.txt +++ /dev/null @@ -1,10 +0,0 @@ -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] -[[100.0]] diff --git a/resnet30/latest_kpis/train_duration_factor.txt b/resnet30/latest_kpis/train_duration_factor.txt deleted file mode 100644 index 13bd6f8b..00000000 --- a/resnet30/latest_kpis/train_duration_factor.txt +++ /dev/null @@ -1,10 +0,0 @@ -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] -[1000.0] diff --git a/resnet50/continuous_evaluation.py b/resnet50/continuous_evaluation.py index 4fa0d179..b4eed6d2 100644 --- a/resnet50/continuous_evaluation.py +++ b/resnet50/continuous_evaluation.py @@ -3,12 +3,12 @@ sys.path.append(os.environ['ceroot']) from kpi import CostKpi, DurationKpi, AccKpi -cifar10_128_train_acc_kpi = AccKpi('cifar10_128_train_acc', 0.05, 0) -cifar10_128_train_speed_kpi = AccKpi('cifar10_128_train_speed', 0.05, 0) -cifar10_128_gpu_memory_kpi = DurationKpi('cifar10_128_gpu_memory', 0.05, 0) +cifar10_128_train_acc_kpi = AccKpi('cifar10_128_train_acc', 0.03, 0, actived=True) +cifar10_128_train_speed_kpi = AccKpi('cifar10_128_train_speed', 0.06, 0, actived=True) +cifar10_128_gpu_memory_kpi = DurationKpi('cifar10_128_gpu_memory', 0.1, 0, 
actived=True) -flowers_64_train_speed_kpi = AccKpi('flowers_64_train_speed', 0.05, 0) -flowers_64_gpu_memory_kpi = DurationKpi('flowers_64_gpu_memory', 0.05, 0) +flowers_64_train_speed_kpi = AccKpi('flowers_64_train_speed', 0.05, 0, actived=True) +flowers_64_gpu_memory_kpi = DurationKpi('flowers_64_gpu_memory', 0.1, 0, actived=True) tracking_kpis = [ cifar10_128_train_acc_kpi, diff --git a/resnet50/get_gpu_data.py b/resnet50/get_gpu_data.py index 83e57e34..1e391253 100644 --- a/resnet50/get_gpu_data.py +++ b/resnet50/get_gpu_data.py @@ -5,7 +5,6 @@ # Copyright (c) 2018 Baidu.com, Inc. All Rights Reserved # ######################################################################## - """ File: get_gpu_data.py Author: paddle(paddle@baidu.com) @@ -14,7 +13,6 @@ import argparse from continuous_evaluation import tracking_kpis - parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( '--batch_size', type=int, default=128, help="Batch size for training.") @@ -26,20 +24,21 @@ help='Optional dataset for benchmark.') args = parser.parse_args() + def save_gpu_data(): mem_list = [] - with open('mem.log', 'r') as f: - for i , data in enumerate(f.readlines()): + with open('memory.txt', 'r') as f: + for i, data in enumerate(f.readlines()): if i == 0: continue mem_list.append(int(data.split("\n")[0].split(" ")[0])) gpu_memory_factor = None - for kpi in tracking_kpis: + for kpi in tracking_kpis: if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size): gpu_memory_kpi = kpi gpu_memory_kpi.add_record(max(mem_list)) gpu_memory_kpi.persist() + if __name__ == "__main__": save_gpu_data() - diff --git a/resnet50/latest_kpis/cifar10_128_gpu_memory_factor.txt b/resnet50/latest_kpis/cifar10_128_gpu_memory_factor.txt index 466467aa..5ebe01c2 100644 --- a/resnet50/latest_kpis/cifar10_128_gpu_memory_factor.txt +++ b/resnet50/latest_kpis/cifar10_128_gpu_memory_factor.txt @@ -1 +1 @@ -[1508] \ No newline at end of file +[1394] diff --git a/resnet50/latest_kpis/cifar10_128_train_acc_factor.txt b/resnet50/latest_kpis/cifar10_128_train_acc_factor.txt index c276983e..83208824 100644 --- a/resnet50/latest_kpis/cifar10_128_train_acc_factor.txt +++ b/resnet50/latest_kpis/cifar10_128_train_acc_factor.txt @@ -1 +1 @@ -[0.99755859375] \ No newline at end of file +[0.93755859375] diff --git a/resnet50/latest_kpis/cifar10_128_train_speed_factor.txt b/resnet50/latest_kpis/cifar10_128_train_speed_factor.txt index c4fe04e3..f37998fd 100644 --- a/resnet50/latest_kpis/cifar10_128_train_speed_factor.txt +++ b/resnet50/latest_kpis/cifar10_128_train_speed_factor.txt @@ -1 +1 @@ -[404.4730529785156] \ No newline at end of file +[738.095703125] diff --git a/resnet50/latest_kpis/flowers_64_gpu_memory_factor.txt b/resnet50/latest_kpis/flowers_64_gpu_memory_factor.txt index 24c95b88..2799deaf 100644 --- a/resnet50/latest_kpis/flowers_64_gpu_memory_factor.txt +++ b/resnet50/latest_kpis/flowers_64_gpu_memory_factor.txt @@ -1 +1 @@ -[11014] \ No newline at end of file +[10352] diff --git a/resnet50/latest_kpis/flowers_64_train_speed_factor.txt b/resnet50/latest_kpis/flowers_64_train_speed_factor.txt index 8585524f..4938eeb8 100644 --- a/resnet50/latest_kpis/flowers_64_train_speed_factor.txt +++ b/resnet50/latest_kpis/flowers_64_train_speed_factor.txt @@ -1 +1 @@ -[78.7945785522461] \ No newline at end of file +[106.87747192382812] diff --git a/resnet50/model.py b/resnet50/model.py index 91977d26..f4e7beea 100644 --- a/resnet50/model.py +++ b/resnet50/model.py @@ -7,6 +7,7 @@ import numpy as np import time import commands 
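+# subprocess is used to launch nvidia-smi in the background and sample GPU
+# memory usage into memory.txt (see collect_gpu_memory_data below).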
+import subprocess import threading import cProfile @@ -90,8 +91,8 @@ def parse_args(): def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] - and vars(args)['device'] == 'GPU') + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') print('----------- Configuration Arguments -----------') for arg, value in sorted(vars(args).iteritems()): print('%s: %s' % (arg, value)) @@ -282,14 +283,15 @@ def test(exe): if iter == args.iterations: break if not args.use_fake_data: - image = np.array( - map(lambda x: x[0].reshape(dshape), data)).astype('float32') + image = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype('float32') label = np.array(map(lambda x: x[1], data)).astype('int64') label = label.reshape([-1, 1]) loss, acc, weight = exe.run( - fluid.default_main_program(), feed={ - 'data': image, 'label': label}, fetch_list=[ - avg_cost, batch_acc, batch_size_tensor]) + fluid.default_main_program(), + feed={'data': image, + 'label': label}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor]) accuracy.add(value=acc, weight=weight) if iter >= args.skip_batch_num or pass_id != 0: batch_duration = time.time() - batch_start @@ -305,8 +307,9 @@ def test(exe): pass_train_acc = accuracy.eval() pass_test_acc = test(exe) print( - "Pass:%d, Loss:%f, Train Accuray:%f, Test Accuray:%f, Handle Images Duration: %f\n" % - (pass_id, np.mean(every_pass_loss), pass_train_acc, pass_test_acc, pass_duration)) + "Pass:%d, Loss:%f, Train Accuray:%f, Test Accuray:%f, Handle Images Duration: %f\n" + % (pass_id, np.mean(every_pass_loss), pass_train_acc, + pass_test_acc, pass_duration)) if pass_id == args.pass_num - 1 and args.data_set == 'cifar10': train_acc_kpi.add_record(np.array(pass_train_acc, dtype='float32')) train_acc_kpi.persist() @@ -317,9 +320,8 @@ def test(exe): train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32')) train_speed_kpi.persist() - print( - '\nTotal examples: %d, total time: %.5f' % - (im_num, total_train_time)) + print('\nTotal examples: %d, total time: %.5f' % + (im_num, total_train_time)) print('%.5f examples/sec, %.5f sec/batch \n' % (examples_per_sec, sec_per_batch)) @@ -332,23 +334,26 @@ def test(exe): print(s.getvalue()) -def collect_gpu_memory_data(mem_list): +def collect_gpu_memory_data(alive): """ collect the GPU memory data """ - while(True): - command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv" % args.gpu_id - status, output = commands.getstatusoutput(command) - if status != 0: - print('Get GPU memory data error') - else: - mem_list.append(int(output.split('\n')[1].split(' ')[0])) + global is_alive + status, output = commands.getstatusoutput('rm -rf memory.txt') + if status == 0: + print('del memory.txt') + command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id + p = subprocess.Popen(command, shell=True) + if p.pid < 0: + print('Get GPU memory data error') + while (is_alive): time.sleep(1) + p.kill() def save_gpu_data(mem_list): gpu_memory_kpi = None - for kpi in tracking_kpis: + for kpi in tracking_kpis: if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size): gpu_memory_kpi = kpi gpu_memory_kpi.add_record(max(mem_list)) @@ -362,12 +367,13 @@ def save_gpu_data(mem_list): } args = parse_args() print_arguments(args) + global is_alive + is_alive = True if args.data_format == 'NHWC': raise ValueError('Only support NCHW data_format now.') - mem_data_list = [] if args.device == 'GPU': collect_memory_thread = 
threading.Thread( - target=collect_gpu_memory_data, args=(mem_data_list,)) + target=collect_gpu_memory_data, args=(is_alive, )) collect_memory_thread.setDaemon(True) collect_memory_thread.start() if args.use_nvprof and args.device == 'GPU': @@ -375,4 +381,4 @@ def save_gpu_data(mem_list): run_benchmark(model_map[args.model], args) else: run_benchmark(model_map[args.model], args) - save_gpu_data(mem_data_list) + is_alive = False diff --git a/resnet50/run.xsh b/resnet50/run.xsh index 81f7847f..04764927 100755 --- a/resnet50/run.xsh +++ b/resnet50/run.xsh @@ -7,6 +7,12 @@ export CUDA_VISIBLE_DEVICES=$cudaid # cifar10 128 FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=128 --data_set=cifar10 --model=resnet_cifar10 --pass_num=30 --gpu_id=$cudaid +python get_gpu_data.py --batch_size=128 --data_set=cifar10 #flowers 64 FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=64 --data_set=flowers --model=resnet_imagenet --pass_num=3 --gpu_id=$cudaid +python get_gpu_data.py --batch_size=64 --data_set=flowers +for pid in $(ps -ef | grep nvidia-smi | grep -v grep | cut -c 9-15); do + echo $pid + kill -9 $pid +done \ No newline at end of file diff --git a/seq2seq/continuous_evaluation.py b/seq2seq/continuous_evaluation.py new file mode 100644 index 00000000..191f2c63 --- /dev/null +++ b/seq2seq/continuous_evaluation.py @@ -0,0 +1,17 @@ +""" +continuous_evaluation.py +""" +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import AccKpi +from kpi import CostKpi +from kpi import DurationKpi + +wmb_128_train_speed_kpi = AccKpi('wmb_128_train_speed', 0.2, 0) +wmb_128_gpu_memory_kpi = DurationKpi('wmb_128_gpu_memory', 0.2, 0) + +tracking_kpis = [ + wmb_128_train_speed_kpi, + wmb_128_gpu_memory_kpi, +] diff --git a/seq2seq/get_gpu_data.py b/seq2seq/get_gpu_data.py new file mode 100644 index 00000000..c852351d --- /dev/null +++ b/seq2seq/get_gpu_data.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +######################################################################## +# +# Copyright (c) 2018 Baidu.com, Inc. 
All Rights Reserved +# +######################################################################## +""" +File: get_gpu_data.py +Author: paddle(paddle@baidu.com) +Date: 2018/04/02 15:57:14 +""" +import argparse +from continuous_evaluation import tracking_kpis + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--data_set', + type=str, + default='wmb', + help='Optional dataset for benchmark.') +args = parser.parse_args() + + +def save_gpu_data(): + mem_list = [] + with open('memory.txt', 'r') as f: + for i, data in enumerate(f.readlines()): + if i == 0: + continue + mem_list.append(int(data.split("\n")[0].split(" ")[0])) + gpu_memory_factor = None + for kpi in tracking_kpis: + if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size): + gpu_memory_kpi = kpi + gpu_memory_kpi.add_record(max(mem_list)) + gpu_memory_kpi.persist() + + +if __name__ == "__main__": + save_gpu_data() diff --git a/seq2seq/latest_kpis/wmb_128_gpu_memory_factor.txt b/seq2seq/latest_kpis/wmb_128_gpu_memory_factor.txt new file mode 100644 index 00000000..e5d1e87f --- /dev/null +++ b/seq2seq/latest_kpis/wmb_128_gpu_memory_factor.txt @@ -0,0 +1 @@ +[6976] diff --git a/seq2seq/latest_kpis/wmb_128_train_speed_factor.txt b/seq2seq/latest_kpis/wmb_128_train_speed_factor.txt new file mode 100644 index 00000000..f845312b --- /dev/null +++ b/seq2seq/latest_kpis/wmb_128_train_speed_factor.txt @@ -0,0 +1 @@ +[4430.63330078125] diff --git a/seq2seq/model.py b/seq2seq/model.py new file mode 100644 index 00000000..7c0db4e0 --- /dev/null +++ b/seq2seq/model.py @@ -0,0 +1,421 @@ +"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util +import commands +import subprocess +import threading + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor + +from continuous_evaluation import tracking_kpis + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=16, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=2, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. 
(default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + "--gpu_id", + type=int, + default=3, + help="The GPU Card Id. (default: %(default)d)") +parser.add_argument( + "--max_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. 
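+        # dynamic_lstm slices this (4 * gate_size)-wide projection into the
+        # input, forget, cell-candidate and output gate pre-activations.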
+ input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + def lstm_decoder_with_attention(target_embedding, encoder_vec, + encoder_proj, decoder_boot, decoder_size): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + act='tanh', + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + encoder_vec = rnn.static_input(encoder_vec) + encoder_proj = rnn.static_input(encoder_proj) + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) + cell_mem = rnn.memory(init=cell_init) + context = simple_attention(encoder_vec, encoder_proj, hidden_mem) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, + decoder_size) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + rnn.output(out) + return rnn() + + if not is_generating: + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, + encoded_proj, decoder_boot, + decoder_size) + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = 
fluid.layers.mean(x=cost) + + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + return avg_cost, feeding_list + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray + + +def train(): + avg_cost, feeding_list = seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = Executor(place) + exe.run(framework.default_startup_program()) + + def do_validation(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] + trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] + lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] + + fetch_outs = exe.run(inference_program, + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] + count += 1 + + return total_loss / count + + train_acc_kpi = None + for kpi in tracking_kpis: + if kpi.name == 'wmb_%s_train_acc' % (args.batch_size): + train_acc_kpi = kpi + train_speed_kpi = None + for kpi in tracking_kpis: + if kpi.name == 'wmb_%s_train_speed' % (args.batch_size): + train_speed_kpi = kpi + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in xrange(args.pass_num): + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_batch_generator()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) + num_samples += word_num + trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) + num_samples += word_num + lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) + + fetch_outs = exe.run(framework.default_main_program(), + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost]) + + iters += 1 + loss = np.array(fetch_outs[0]) + print( + "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss) + ) # The accuracy is the accumulation of batches, but not the current batch. 
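+        # Throughput below is measured from the end of the warm-up
+        # (skip_batch_num) batches; num_samples counts source and target
+        # tokens.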
+
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32'))
+        if args.with_test:
+            test_loss = do_validation()
+            break
+    train_speed_kpi.persist()
+
+
+def infer():
+    pass
+
+
+def print_arguments(args):
+    print('----------- seq2seq Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def collect_gpu_memory_data(alive):
+    """
+    collect the GPU memory data
+    """
+    global is_alive
+    status, output = commands.getstatusoutput('rm -rf memory.txt')
+    if status == 0:
+        print('del memory.txt')
+    command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id
+    p = subprocess.Popen(command, shell=True)
+    if p.pid < 0:
+        print('Get GPU memory data error')
+    while (is_alive):
+        time.sleep(1)
+    p.kill()
+
+
+def save_gpu_data(mem_list):
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == 'wmb_%s_gpu_memory' % (args.batch_size):
+            gpu_memory_kpi = kpi
+    gpu_memory_kpi.add_record(max(mem_list))
+    gpu_memory_kpi.persist()
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+    global is_alive
+    is_alive = True
+    collect_memory_thread = threading.Thread(
+        target=collect_gpu_memory_data, args=(is_alive, ))
+    collect_memory_thread.setDaemon(True)
+    collect_memory_thread.start()
+    if args.infer_only:
+        infer()
+    else:
+        train()
+    is_alive = False diff --git a/seq2seq/run.xsh b/seq2seq/run.xsh new file mode 100755 index 00000000..2e315c0c --- /dev/null +++ b/seq2seq/run.xsh @@ -0,0 +1,14 @@ +#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${seq2seq_cudaid:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+#wmt14 128
+FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=128 --iterations=50 --gpu_id=$cudaid
+python get_gpu_data.py --batch_size=128 --data_set=wmb
+for pid in $(ps -ef | grep nvidia-smi | grep -v grep | cut -c 9-15); do
+    echo $pid
+    kill -9 $pid
+done diff --git a/sequence_tagging_for_ner/README.md b/sequence_tagging_for_ner/README.md new file mode 100644 index 00000000..1f634da4 --- /dev/null +++ b/sequence_tagging_for_ner/README.md @@ -0,0 +1,120 @@ +# Named Entity Recognition
+
+Below is the directory layout of this example, with a brief description of each item:
+
+```text
+.
+├── data            # the data this example depends on; fetched externally
+├── network_conf.py # model definition
+├── reader.py       # data reading interface; fetched externally
+├── README.md       # this document
+├── train.py        # training script
+├── infer.py        # inference script
+├── utils.py        # common helper functions; fetched externally
+└── utils_extend.py # extensions to utils.py
+```
+
+
+## Introduction and Model Details
+
+The PaddlePaddle v2 [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) example describes the NER task in detail, so the introduction is not repeated here.
+We keep the v2 model structure; the only difference is that we use an LSTM in place of the original RNN.
+
+## Getting the Data
+
+Follow the data acquisition steps in the PaddlePaddle v2 [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) example: copy its data folder into this example's directory and run the download.sh script inside it to fetch the training and test data.
+
+## Getting the Shared Scripts
+
+Copy the data-reading file [reader.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/reader.py) and the file [utils.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/utils.py), which provides shared helpers such as dictionary loading, from the PaddlePaddle v2 [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) example into this directory. Both scripts are used by this example.
+
+## Training
+
+1. Run `sh data/download.sh`
+2. Modify the `main` function of `train.py` to specify the data paths
+
+    ```python
+    main(
+        train_data_file="data/train",
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        emb_file="data/wordVectors.txt",
+        model_save_dir="models",
+        num_passes=1000,
+        use_gpu=False,
+        parallel=False)
+    ```
+
+3. Run `python train.py`. **Note: running it as-is trains on the sample data; replace it with real labeled data.**
+
+    ```text
+    Pass 127, Batch 9525, Cost 4.0867705, Precision 0.3954984, Recall 0.37846154, F1_score0.38679245
+    Pass 127, Batch 9530, Cost 3.137265, Precision 0.42971888, Recall 0.38351256, F1_score0.405303
+    Pass 127, Batch 9535, Cost 3.6240938, Precision 0.4272152, Recall 0.41795665, F1_score0.4225352
+    Pass 127, Batch 9540, Cost 3.5352352, Precision 0.48464164, Recall 0.4536741, F1_score0.46864685
+    Pass 127, Batch 9545, Cost 4.1130385, Precision 0.40131578, Recall 0.3836478, F1_score0.39228293
+    Pass 127, Batch 9550, Cost 3.6826708, Precision 0.43333334, Recall 0.43730888, F1_score0.43531203
+    Pass 127, Batch 9555, Cost 3.6363933, Precision 0.42424244, Recall 0.3962264, F1_score0.4097561
+    Pass 127, Batch 9560, Cost 3.6101768, Precision 0.51363635, Recall 0.353125, F1_score0.41851854
+    Pass 127, Batch 9565, Cost 3.5935276, Precision 0.5152439, Recall 0.5, F1_score0.5075075
+    Pass 127, Batch 9570, Cost 3.4987144, Precision 0.5, Recall 0.4330218, F1_score0.46410686
+    Pass 127, Batch 9575, Cost 3.4659843, Precision 0.39864865, Recall 0.38064516, F1_score0.38943896
+    Pass 127, Batch 9580, Cost 3.1702557, Precision 0.5, Recall 0.4490446, F1_score0.47315437
+    Pass 127, Batch 9585, Cost 3.1587276, Precision 0.49377593, Recall 0.4089347, F1_score0.4473684
+    Pass 127, Batch 9590, Cost 3.5043538, Precision 0.4556962, Recall 0.4600639, F1_score0.45786962
+    Pass 127, Batch 9595, Cost 2.981989, Precision 0.44981414, Recall 0.45149255, F1_score0.4506518
+    [TrainSet] pass_id:127 pass_precision:[0.46023396] pass_recall:[0.43197003] pass_f1_score:[0.44565433]
+    [TestSet] pass_id:127 pass_precision:[0.4708409] pass_recall:[0.47971722] pass_f1_score:[0.4752376]
+    ```
+## Inference
+1. Modify the `infer` function of [infer.py](./infer.py) to specify the path of the model to test, the test data, the vocabulary file and the target label file; the defaults are:
+
+    ```python
+    infer(
+        model_path="models/params_pass_0",
+        batch_size=6,
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        use_gpu=False
+    )
+    ```
+
+2. Run `python infer.py` in a terminal to start inference. You will see predictions like the following (partial output of a model trained for 70 passes):
+
+    ```text
+    leicestershire B-ORG B-LOC
+    extended O O
+    their O O
+    first O O
+    innings O O
+    by O O
+    DGDG O O
+    runs O O
+    before O O
+    being O O
+    bowled O O
+    out O O
+    for O O
+    296 O O
+    with O O
+    england B-LOC B-LOC
+    discard O O
+    andy B-PER B-PER
+    caddick I-PER I-PER
+    taking O O
+    three O O
+    for O O
+    DGDG O O
+    . O O
+    ```
+
+    The output has three columns separated by "\t": the first is the input word, the second is the gold label, and the third is the predicted label. Input sequences are separated by blank lines.
+
+## Example Results
+
+![convergence curve](imgs/convergence_curve.png)
+
+Figure 1. Learning curve: the x-axis is the number of training passes and the y-axis is the F1 score.
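+
+As a quick sanity check, the three-column output of `python infer.py` can be scored with a few lines of Python. The snippet below is a sketch, not part of this example; it assumes the predictions were redirected to a hypothetical file `pred.txt`:
+
+```python
+# tag-level accuracy: compare the gold (2nd) and predicted (3rd) columns
+correct, total = 0, 0
+with open("pred.txt") as f:
+    for line in f:
+        parts = line.rstrip("\n").split("\t")
+        if len(parts) != 3:  # blank lines separate sequences
+            continue
+        word, gold, pred = parts
+        total += 1
+        correct += int(gold == pred)
+print("tag accuracy: %.4f" % (correct / float(total)))
+```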
diff --git a/sequence_tagging_for_ner/continuous_evaluation.py b/sequence_tagging_for_ner/continuous_evaluation.py new file mode 100644 index 00000000..e8e4ccd0 --- /dev/null +++ b/sequence_tagging_for_ner/continuous_evaluation.py @@ -0,0 +1,20 @@ +""" +continuous_evaluation.py +""" +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import AccKpi +from kpi import DurationKpi + +train_acc_kpi = AccKpi('train_acc', 0.2, 0) +pass_duration_kpi = DurationKpi('pass_duration', 0.02, 0, actived=True) +train_acc_kpi_card4 = AccKpi('train_acc_card4', 0.2, 0) +pass_duration_kpi_card4 = DurationKpi('pass_duration_card4', 0.02, 0, actived=True) + +tracking_kpis = [ + train_acc_kpi, + pass_duration_kpi, + train_acc_kpi_card4, + pass_duration_kpi_card4, +] diff --git a/sequence_tagging_for_ner/data/target.txt b/sequence_tagging_for_ner/data/target.txt new file mode 100644 index 00000000..e0fa4d8f --- /dev/null +++ b/sequence_tagging_for_ner/data/target.txt @@ -0,0 +1,9 @@ +B-LOC +I-LOC +B-MISC +I-MISC +B-ORG +I-ORG +B-PER +I-PER +O diff --git a/sequence_tagging_for_ner/data/test b/sequence_tagging_for_ner/data/test new file mode 100644 index 00000000..66163e1a --- /dev/null +++ b/sequence_tagging_for_ner/data/test @@ -0,0 +1,128 @@ +CRICKET NNP I-NP O +- : O O +LEICESTERSHIRE NNP I-NP I-ORG +TAKE NNP I-NP O +OVER IN I-PP O +AT NNP I-NP O +TOP NNP I-NP O +AFTER NNP I-NP O +INNINGS NNP I-NP O +VICTORY NN I-NP O +. . O O + +LONDON NNP I-NP I-LOC +1996-08-30 CD I-NP O + +West NNP I-NP I-MISC +Indian NNP I-NP I-MISC +all-rounder NN I-NP O +Phil NNP I-NP I-PER +Simmons NNP I-NP I-PER +took VBD I-VP O +four CD I-NP O +for IN I-PP O +38 CD I-NP O +on IN I-PP O +Friday NNP I-NP O +as IN I-PP O +Leicestershire NNP I-NP I-ORG +beat VBD I-VP O +Somerset NNP I-NP I-ORG +by IN I-PP O +an DT I-NP O +innings NN I-NP O +and CC O O +39 CD I-NP O +runs NNS I-NP O +in IN I-PP O +two CD I-NP O +days NNS I-NP O +to TO I-VP O +take VB I-VP O +over IN I-PP O +at IN B-PP O +the DT I-NP O +head NN I-NP O +of IN I-PP O +the DT I-NP O +county NN I-NP O +championship NN I-NP O +. . O O + +Their PRP$ I-NP O +stay NN I-NP O +on IN I-PP O +top NN I-NP O +, , O O +though RB I-ADVP O +, , O O +may MD I-VP O +be VB I-VP O +short-lived JJ I-ADJP O +as IN I-PP O +title NN I-NP O +rivals NNS I-NP O +Essex NNP I-NP I-ORG +, , O O +Derbyshire NNP I-NP I-ORG +and CC I-NP O +Surrey NNP I-NP I-ORG +all DT O O +closed VBD I-VP O +in RP I-PRT O +on IN I-PP O +victory NN I-NP O +while IN I-SBAR O +Kent NNP I-NP I-ORG +made VBD I-VP O +up RP I-PRT O +for IN I-PP O +lost VBN I-NP O +time NN I-NP O +in IN I-PP O +their PRP$ I-NP O +rain-affected JJ I-NP O +match NN I-NP O +against IN I-PP O +Nottinghamshire NNP I-NP I-ORG +. . O O + +After IN I-PP O +bowling VBG I-NP O +Somerset NNP I-NP I-ORG +out RP I-PRT O +for IN I-PP O +83 CD I-NP O +on IN I-PP O +the DT I-NP O +opening NN I-NP O +morning NN I-NP O +at IN I-PP O +Grace NNP I-NP I-LOC +Road NNP I-NP I-LOC +, , O O +Leicestershire NNP I-NP I-ORG +extended VBD I-VP O +their PRP$ I-NP O +first JJ I-NP O +innings NN I-NP O +by IN I-PP O +94 CD I-NP O +runs VBZ I-VP O +before IN I-PP O +being VBG I-VP O +bowled VBD I-VP O +out RP I-PRT O +for IN I-PP O +296 CD I-NP O +with IN I-PP O +England NNP I-NP I-LOC +discard VBP I-VP O +Andy NNP I-NP I-PER +Caddick NNP I-NP I-PER +taking VBG I-VP O +three CD I-NP O +for IN I-PP O +83 CD I-NP O +. . 
O O + diff --git a/sequence_tagging_for_ner/data/train b/sequence_tagging_for_ner/data/train new file mode 100644 index 00000000..cbf3e678 --- /dev/null +++ b/sequence_tagging_for_ner/data/train @@ -0,0 +1,139 @@ +EU NNP I-NP I-ORG +rejects VBZ I-VP O +German JJ I-NP I-MISC +call NN I-NP O +to TO I-VP O +boycott VB I-VP O +British JJ I-NP I-MISC +lamb NN I-NP O +. . O O + +Peter NNP I-NP I-PER +Blackburn NNP I-NP I-PER + +BRUSSELS NNP I-NP I-LOC +1996-08-22 CD I-NP O + +The DT I-NP O +European NNP I-NP I-ORG +Commission NNP I-NP I-ORG +said VBD I-VP O +on IN I-PP O +Thursday NNP I-NP O +it PRP B-NP O +disagreed VBD I-VP O +with IN I-PP O +German JJ I-NP I-MISC +advice NN I-NP O +to TO I-PP O +consumers NNS I-NP O +to TO I-VP O +shun VB I-VP O +British JJ I-NP I-MISC +lamb NN I-NP O +until IN I-SBAR O +scientists NNS I-NP O +determine VBP I-VP O +whether IN I-SBAR O +mad JJ I-NP O +cow NN I-NP O +disease NN I-NP O +can MD I-VP O +be VB I-VP O +transmitted VBN I-VP O +to TO I-PP O +sheep NN I-NP O +. . O O + +Germany NNP I-NP I-LOC +'s POS B-NP O +representative NN I-NP O +to TO I-PP O +the DT I-NP O +European NNP I-NP I-ORG +Union NNP I-NP I-ORG +'s POS B-NP O +veterinary JJ I-NP O +committee NN I-NP O +Werner NNP I-NP I-PER +Zwingmann NNP I-NP I-PER +said VBD I-VP O +on IN I-PP O +Wednesday NNP I-NP O +consumers NNS I-NP O +should MD I-VP O +buy VB I-VP O +sheepmeat NN I-NP O +from IN I-PP O +countries NNS I-NP O +other JJ I-ADJP O +than IN I-PP O +Britain NNP I-NP I-LOC +until IN I-SBAR O +the DT I-NP O +scientific JJ I-NP O +advice NN I-NP O +was VBD I-VP O +clearer JJR I-ADJP O +. . O O + +" " O O +We PRP I-NP O +do VBP I-VP O +n't RB I-VP O +support VB I-VP O +any DT I-NP O +such JJ I-NP O +recommendation NN I-NP O +because IN I-SBAR O +we PRP I-NP O +do VBP I-VP O +n't RB I-VP O +see VB I-VP O +any DT I-NP O +grounds NNS I-NP O +for IN I-PP O +it PRP I-NP O +, , O O +" " O O +the DT I-NP O +Commission NNP I-NP I-ORG +'s POS B-NP O +chief JJ I-NP O +spokesman NN I-NP O +Nikolaus NNP I-NP I-PER +van NNP I-NP I-PER +der FW I-NP I-PER +Pas NNP I-NP I-PER +told VBD I-VP O +a DT I-NP O +news NN I-NP O +briefing NN I-NP O +. . O O + +He PRP I-NP O +said VBD I-VP O +further JJ I-NP O +scientific JJ I-NP O +study NN I-NP O +was VBD I-VP O +required VBN I-VP O +and CC O O +if IN I-SBAR O +it PRP I-NP O +was VBD I-VP O +found VBN I-VP O +that IN I-SBAR O +action NN I-NP O +was VBD I-VP O +needed VBN I-VP O +it PRP I-NP O +should MD I-VP O +be VB I-VP O +taken VBN I-VP O +by IN I-PP O +the DT I-NP O +European NNP I-NP I-ORG +Union NNP I-NP I-ORG +. . O O + diff --git a/sequence_tagging_for_ner/download.sh b/sequence_tagging_for_ner/download.sh new file mode 100644 index 00000000..861f943e --- /dev/null +++ b/sequence_tagging_for_ner/download.sh @@ -0,0 +1,15 @@ +if [ -f assignment2.zip ]; then + echo "data exist" +else + wget http://cs224d.stanford.edu/assignment2/assignment2.zip +fi + +if [ $? -eq 0 ];then + unzip assignment2.zip + cp assignment2_release/data/ner/wordVectors.txt ./data + cp assignment2_release/data/ner/vocab.txt ./data + rm -rf assignment2.zip assignment2_release +else + echo "download data error!" 
>> /dev/stderr + exit 1 +fi diff --git a/sequence_tagging_for_ner/imgs/convergence_curve.png b/sequence_tagging_for_ner/imgs/convergence_curve.png new file mode 100644 index 00000000..6b862b75 Binary files /dev/null and b/sequence_tagging_for_ner/imgs/convergence_curve.png differ diff --git a/sequence_tagging_for_ner/infer.py b/sequence_tagging_for_ner/infer.py new file mode 100644 index 00000000..2d0bd949 --- /dev/null +++ b/sequence_tagging_for_ner/infer.py @@ -0,0 +1,71 @@ +import numpy as np + +import paddle.fluid as fluid +import paddle.v2 as paddle + +from network_conf import ner_net +import reader +from utils import load_dict, load_reverse_dict +from utils_extend import to_lodtensor + + +def infer(model_path, batch_size, test_data_file, vocab_file, target_file, + use_gpu): + """ + use the model under model_path to predict the test data, the result will be printed on the screen + + return nothing + """ + word_dict = load_dict(vocab_file) + word_reverse_dict = load_reverse_dict(vocab_file) + + label_dict = load_dict(target_file) + label_reverse_dict = load_reverse_dict(target_file) + + test_data = paddle.batch( + reader.data_reader(test_data_file, word_dict, label_dict), + batch_size=batch_size) + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + for data in test_data(): + word = to_lodtensor(map(lambda x: x[0], data), place) + mark = to_lodtensor(map(lambda x: x[1], data), place) + target = to_lodtensor(map(lambda x: x[2], data), place) + crf_decode = exe.run( + inference_program, + feed={"word": word, + "mark": mark, + "target": target}, + fetch_list=fetch_targets, + return_numpy=False) + lod_info = (crf_decode[0].lod())[0] + np_data = np.array(crf_decode[0]) + assert len(data) == len(lod_info) - 1 + for sen_index in xrange(len(data)): + assert len(data[sen_index][0]) == lod_info[ + sen_index + 1] - lod_info[sen_index] + word_index = 0 + for tag_index in xrange(lod_info[sen_index], + lod_info[sen_index + 1]): + word = word_reverse_dict[data[sen_index][0][word_index]] + gold_tag = label_reverse_dict[data[sen_index][2][ + word_index]] + tag = label_reverse_dict[np_data[tag_index][0]] + print word + "\t" + gold_tag + "\t" + tag + word_index += 1 + print "" + + +if __name__ == "__main__": + infer( + model_path="models/params_pass_0", + batch_size=6, + test_data_file="data/test", + vocab_file="data/vocab.txt", + target_file="data/target.txt", + use_gpu=False) diff --git a/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt b/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt new file mode 100644 index 00000000..bbcc1bf4 --- /dev/null +++ b/sequence_tagging_for_ner/latest_kpis/pass_duration_card4_factor.txt @@ -0,0 +1 @@ +[0.04497942033021347] \ No newline at end of file diff --git a/sequence_tagging_for_ner/latest_kpis/pass_duration_factor.txt b/sequence_tagging_for_ner/latest_kpis/pass_duration_factor.txt new file mode 100644 index 00000000..683e1d69 --- /dev/null +++ b/sequence_tagging_for_ner/latest_kpis/pass_duration_factor.txt @@ -0,0 +1 @@ +[0.021749680643496307] diff --git a/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt b/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt new file mode 100644 index 00000000..e7a19a6e --- /dev/null +++ 
b/sequence_tagging_for_ner/latest_kpis/train_acc_card4_factor.txt @@ -0,0 +1 @@ +[1.0] \ No newline at end of file diff --git a/sequence_tagging_for_ner/latest_kpis/train_acc_factor.txt b/sequence_tagging_for_ner/latest_kpis/train_acc_factor.txt new file mode 100644 index 00000000..0ea64a68 --- /dev/null +++ b/sequence_tagging_for_ner/latest_kpis/train_acc_factor.txt @@ -0,0 +1 @@ +[1.0] diff --git a/sequence_tagging_for_ner/network_conf.py b/sequence_tagging_for_ner/network_conf.py new file mode 100644 index 00000000..3611d7b7 --- /dev/null +++ b/sequence_tagging_for_ner/network_conf.py @@ -0,0 +1,130 @@ +import math + +import paddle.fluid as fluid +from paddle.fluid.initializer import NormalInitializer + +from utils import logger, load_dict, get_embedding + + +def ner_net(word_dict_len, label_dict_len, parallel, stack_num=2): + mark_dict_len = 2 + word_dim = 50 + mark_dim = 5 + hidden_dim = 300 + IS_SPARSE = True + embedding_name = 'emb' + + def _net_conf(word, mark, target): + word_embedding = fluid.layers.embedding( + input=word, + size=[word_dict_len, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) + + mark_embedding = fluid.layers.embedding( + input=mark, + size=[mark_dict_len, mark_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + word_caps_vector = fluid.layers.concat( + input=[word_embedding, mark_embedding], axis=1) + mix_hidden_lr = 1 + + rnn_para_attr = fluid.ParamAttr( + initializer=NormalInitializer( + loc=0.0, scale=0.0), + learning_rate=mix_hidden_lr) + hidden_para_attr = fluid.ParamAttr( + initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)), + learning_rate=mix_hidden_lr) + + hidden = fluid.layers.fc( + input=word_caps_vector, + name="__hidden00__", + size=hidden_dim, + act="tanh", + bias_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))), + param_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)))) + fea = [] + for direction in ["fwd", "bwd"]: + for i in range(stack_num): + if i != 0: + hidden = fluid.layers.fc( + name="__hidden%02d_%s__" % (i, direction), + size=hidden_dim, + act="stanh", + bias_attr=fluid.ParamAttr( + initializer=NormalInitializer( + loc=0.0, scale=1.0)), + input=[hidden, rnn[0], rnn[1]], + param_attr=[ + hidden_para_attr, rnn_para_attr, rnn_para_attr + ]) + rnn = fluid.layers.dynamic_lstm( + name="__rnn%02d_%s__" % (i, direction), + input=hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + bias_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=1.0)), + is_reverse=(i % 2) if direction == "fwd" else not i % 2, + param_attr=rnn_para_attr) + fea += [hidden, rnn[0], rnn[1]] + + rnn_fea = fluid.layers.fc( + size=hidden_dim, + bias_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))), + act="stanh", + input=fea, + param_attr=[hidden_para_attr, rnn_para_attr, rnn_para_attr] * 2) + + emission = fluid.layers.fc( + size=label_dict_len, + input=rnn_fea, + param_attr=fluid.ParamAttr(initializer=NormalInitializer( + loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)))) + + crf_cost = fluid.layers.linear_chain_crf( + input=emission, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', + initializer=NormalInitializer( + loc=0.0, scale=(1. 
/ math.sqrt(hidden_dim) / 3)), + learning_rate=mix_hidden_lr)) + avg_cost = fluid.layers.mean(x=crf_cost) + return avg_cost, emission + + word = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark', shape=[1], dtype='int64', lod_level=1) + target = fluid.layers.data( + name="target", shape=[1], dtype='int64', lod_level=1) + + if parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + word_ = pd.read_input(word) + mark_ = pd.read_input(mark) + target_ = pd.read_input(target) + avg_cost, emission_base = _net_conf(word_, mark_, target_) + pd.write_output(avg_cost) + pd.write_output(emission_base) + avg_cost_list, emission = pd() + avg_cost = fluid.layers.mean(x=avg_cost_list) + emission.stop_gradient = True + else: + avg_cost, emission = _net_conf(word, mark, target) + + return avg_cost, emission, word, mark, target diff --git a/sequence_tagging_for_ner/reader.py b/sequence_tagging_for_ner/reader.py new file mode 100644 index 00000000..5050d0bf --- /dev/null +++ b/sequence_tagging_for_ner/reader.py @@ -0,0 +1,66 @@ +""" +Conll03 dataset. +""" + +from utils import * + +__all__ = ["data_reader"] + + +def canonicalize_digits(word): + if any([c.isalpha() for c in word]): return word + word = re.sub("\d", "DG", word) + if word.startswith("DG"): + word = word.replace(",", "") # remove thousands separator + return word + + +def canonicalize_word(word, wordset=None, digits=True): + word = word.lower() + if digits: + if (wordset != None) and (word in wordset): return word + word = canonicalize_digits(word) # try to canonicalize numbers + if (wordset == None) or (word in wordset): return word + else: return "UUUNKKK" # unknown token + + +def data_reader(data_file, word_dict, label_dict): + """ + The dataset can be obtained according to http://www.clips.uantwerpen.be/conll2003/ner/. + It returns a reader creator, each sample in the reader includes: + word id sequence, label id sequence and raw sentence. 
+ + :return: reader creator + :rtype: callable + """ + + def reader(): + UNK_IDX = word_dict["UUUNKKK"] + + sentence = [] + labels = [] + with open(data_file, "r") as f: + for line in f: + if len(line.strip()) == 0: + if len(sentence) > 0: + word_idx = [ + word_dict.get( + canonicalize_word(w, word_dict), UNK_IDX) + for w in sentence + ] + mark = [1 if w[0].isupper() else 0 for w in sentence] + label_idx = [label_dict[l] for l in labels] + yield word_idx, mark, label_idx + sentence = [] + labels = [] + else: + segs = line.strip().split() + sentence.append(segs[0]) + # transform I-TYPE to BIO schema + if segs[-1] != "O" and (len(labels) == 0 or + labels[-1][1:] != segs[-1][1:]): + labels.append("B" + segs[-1][1:]) + else: + labels.append(segs[-1]) + + return reader diff --git a/sequence_tagging_for_ner/run.xsh b/sequence_tagging_for_ner/run.xsh new file mode 100755 index 00000000..9fda2d21 --- /dev/null +++ b/sequence_tagging_for_ner/run.xsh @@ -0,0 +1,16 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +cudaid=${sequence_tagging:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid +#pass_num 2200 +sh download.sh +FLAGS_benchmark=true python train.py + +cudaid=${sequence_tagging_m:=0,1,2,3} # use multi card as default +export CUDA_VISIBLE_DEVICES=$cudaid +#pass_num 2200 +sh download.sh +FLAGS_benchmark=true python train.py --gpu_card_num 4 diff --git a/sequence_tagging_for_ner/train.py b/sequence_tagging_for_ner/train.py new file mode 100644 index 00000000..a80e75b1 --- /dev/null +++ b/sequence_tagging_for_ner/train.py @@ -0,0 +1,149 @@ +import os +import time +import math +import numpy as np + +import paddle +import paddle.fluid as fluid +import argparse +import reader +from network_conf import ner_net +from utils import logger, load_dict +from utils_extend import to_lodtensor, get_embedding +from continuous_evaluation import * + +def parse_args(): + parser = argparse.ArgumentParser("sequence_tagging_for_ner model benchmark.") + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + + args = parser.parse_args() + return args + +def test(exe, chunk_evaluator, inference_program, test_data, place): + chunk_evaluator.reset(exe) + for data in test_data(): + word = to_lodtensor(map(lambda x: x[0], data), place) + mark = to_lodtensor(map(lambda x: x[1], data), place) + target = to_lodtensor(map(lambda x: x[2], data), place) + acc = exe.run(inference_program, + feed={"word": word, + "mark": mark, + "target": target}) + return chunk_evaluator.eval(exe) + + +def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, + model_save_dir, num_passes, use_gpu, parallel): + + args = parse_args() + if not os.path.exists(model_save_dir): + os.mkdir(model_save_dir) + + BATCH_SIZE = 200 + word_dict = load_dict(vocab_file) + label_dict = load_dict(target_file) + + word_vector_values = get_embedding(emb_file) + + word_dict_len = len(word_dict) + label_dict_len = len(label_dict) + + avg_cost, feature_out, word, mark, target = ner_net( + word_dict_len, label_dict_len, parallel) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + sgd_optimizer.minimize(avg_cost) + + crf_decode = fluid.layers.crf_decoding( + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + chunk_evaluator = fluid.evaluator.ChunkEvaluator( + input=crf_decode, + label=target, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) + + inference_program = fluid.default_main_program().clone() + with 
fluid.program_guard(inference_program): + test_target = chunk_evaluator.metrics + chunk_evaluator.states + inference_program = fluid.io.get_inference_program(test_target) + + train_reader = paddle.batch( + paddle.reader.shuffle( + reader.data_reader(train_data_file, word_dict, label_dict), + buf_size=20000), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.reader.shuffle( + reader.data_reader(test_data_file, word_dict, label_dict), + buf_size=20000), + batch_size=BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place) + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + + embedding_name = 'emb' + embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor( + ) + embedding_param.set(word_vector_values, place) + + batch_id = 0 + total_time = 0.0 + for pass_id in xrange(num_passes): + chunk_evaluator.reset(exe) + start_time = time.time() + for data in train_reader(): + cost, batch_precision, batch_recall, batch_f1_score = exe.run( + fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + chunk_evaluator.metrics) + batch_id = batch_id + 1 + t1 = time.time() + total_time += t1 - start_time + pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe) + if pass_id == num_passes - 1: + if args.gpu_card_num == 1: + train_acc_kpi.add_record(pass_precision) + pass_duration_kpi.add_record(total_time / num_passes) + else: + train_acc_kpi_card4.add_record(pass_precision) + pass_duration_kpi_card4.add_record(total_time / num_passes) + + if pass_id % 100 == 0: + print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + + str(pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str(pass_f1_score)) + pass_precision, pass_recall, pass_f1_score = test( + exe, chunk_evaluator, inference_program, test_reader, place) + if pass_id % 100 == 0: + print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + + str(pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str(pass_f1_score)) + + #save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id) + #fluid.io.save_inference_model( + # save_dirname, ['word', 'mark', 'target'], [crf_decode], exe) + + if args.gpu_card_num == 1: + train_acc_kpi.persist() + pass_duration_kpi.persist() + else: + train_acc_kpi_card4.persist() + pass_duration_kpi_card4.persist() + + +if __name__ == "__main__": + main( + train_data_file="data/train", + test_data_file="data/test", + vocab_file="data/vocab.txt", + target_file="data/target.txt", + emb_file="data/wordVectors.txt", + model_save_dir="models", + num_passes=2300, + use_gpu=True, + parallel=True) diff --git a/sequence_tagging_for_ner/utils.py b/sequence_tagging_for_ner/utils.py new file mode 100644 index 00000000..f40f1bb1 --- /dev/null +++ b/sequence_tagging_for_ner/utils.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import logging +import os +import re +import argparse +import numpy as np +from collections import defaultdict + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +def get_embedding(emb_file='data/wordVectors.txt'): + """ + Get the trained word vector. + """ + return np.loadtxt(emb_file, dtype=float) + + +def load_dict(dict_path): + """ + Load the word dictionary from the given file. + Each line of the given file is a word, which can include multiple columns + seperated by tab. 
+
+    This function takes the first column (columns in a line are separated by
+    tab) as the key, and the line number of the line (the index of the word
+    in the dictionary) as the value.
+    """
+
+    return dict((line.strip().split("\t")[0], idx)
+                for idx, line in enumerate(open(dict_path, "r").readlines()))
+
+
+def load_reverse_dict(dict_path):
+    """
+    Load the word dictionary from the given file.
+    Each line of the given file is a word, which can include multiple columns
+    separated by tab.
+
+    This function takes the line number of the line (the index of the word in
+    the dictionary) as the key, and the first column (columns in a line are
+    separated by tab) as the value.
+    """
+    return dict((idx, line.strip().split("\t")[0])
+                for idx, line in enumerate(open(dict_path, "r").readlines()))
diff --git a/sequence_tagging_for_ner/utils_extend.py b/sequence_tagging_for_ner/utils_extend.py
new file mode 100644
index 00000000..03e7e62f
--- /dev/null
+++ b/sequence_tagging_for_ner/utils_extend.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+import paddle.fluid as fluid
+
+
+def get_embedding(emb_file='data/wordVectors.txt'):
+    """
+    Get the trained word vector.
+    """
+    return np.loadtxt(emb_file, dtype='float32')
+
+
+def to_lodtensor(data, place):
+    """
+    Convert data to a LoDTensor.
+    """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
diff --git a/text_classification/README.md b/text_classification/README.md
new file mode 100644
index 00000000..7855f6c2
--- /dev/null
+++ b/text_classification/README.md
@@ -0,0 +1,113 @@
+# Text Classification
+
+Below is a brief layout of this example's directory structure:
+
+```text
+.
+├── nets.py       # model definitions
+├── README.md     # this documentation
+├── train.py      # training script
+├── infer.py      # inference script
+└── utils.py      # common utility functions (external data fetching, etc.)
+```
+
+
+## Introduction and Model Details
+
+The PaddlePaddle v2 [text classification](https://github.com/PaddlePaddle/models/blob/develop/text/README.md) example already introduces the text classification task in detail, so the introduction is not repeated here.
+For the models, we adopt four common text classification networks: bow, cnn, lstm, and gru.
+
+## Training
+
+1. Run the command `python train.py bow` to start training the model.
+   ```python
+   python train.py bow   # "bow" selects the network structure and can be replaced with cnn, lstm, or gru
+   ```
+
+2. (Optional) To customize the network structure, add your own network in [nets.py](./nets.py) and set the corresponding parameters in [train.py](./train.py). The signature of `train` is shown below, followed by an illustrative sketch of a custom network.
+   ```python
+   def train(train_reader,    # training data reader
+             word_dict,       # word dictionary
+             network,         # network configuration
+             use_cuda,        # whether to use the GPU
+             parallel,        # whether to train in parallel
+             save_dirname,    # path for saving the model
+             lr=0.2,          # learning rate
+             batch_size=128,  # number of samples per batch
+             pass_num=30):    # number of training passes
+   ```
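+
+A custom network only needs to follow the same contract as `bow_net`, `cnn_net`, `lstm_net`, and `gru_net` in [nets.py](./nets.py): it takes `(data, label, dict_dim)` plus optional hyper-parameters and returns `(avg_cost, acc, prediction)`. The sketch below is illustrative only; the name `mlp_net` is made up and is not part of nets.py:
+
+```python
+import paddle.fluid as fluid
+
+
+def mlp_net(data, label, dict_dim, emb_dim=128, hid_dim=128, class_dim=2):
+    """Illustrative custom net: embedding -> average pooling -> FC layers."""
+    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    pooled = fluid.layers.sequence_pool(input=emb, pool_type='average')
+    fc = fluid.layers.fc(input=pooled, size=hid_dim, act="tanh")
+    prediction = fluid.layers.fc(input=fc, size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+```
+
+After adding it, import it in [train.py](./train.py) and pass it to `train(...)` just like the built-in networks.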
+
+## Sample Training Results
+```text
+pass_id: 0, avg_acc: 0.848040, avg_cost: 0.354073
+pass_id: 1, avg_acc: 0.914200, avg_cost: 0.217945
+pass_id: 2, avg_acc: 0.929800, avg_cost: 0.184302
+pass_id: 3, avg_acc: 0.938680, avg_cost: 0.164240
+pass_id: 4, avg_acc: 0.945120, avg_cost: 0.149150
+pass_id: 5, avg_acc: 0.951280, avg_cost: 0.137117
+pass_id: 6, avg_acc: 0.955360, avg_cost: 0.126434
+pass_id: 7, avg_acc: 0.961400, avg_cost: 0.117405
+pass_id: 8, avg_acc: 0.963560, avg_cost: 0.110070
+pass_id: 9, avg_acc: 0.965840, avg_cost: 0.103273
+pass_id: 10, avg_acc: 0.969800, avg_cost: 0.096314
+pass_id: 11, avg_acc: 0.971720, avg_cost: 0.090206
+pass_id: 12, avg_acc: 0.974800, avg_cost: 0.084970
+pass_id: 13, avg_acc: 0.977400, avg_cost: 0.078981
+pass_id: 14, avg_acc: 0.980000, avg_cost: 0.073685
+pass_id: 15, avg_acc: 0.981080, avg_cost: 0.069898
+pass_id: 16, avg_acc: 0.982080, avg_cost: 0.064923
+pass_id: 17, avg_acc: 0.984680, avg_cost: 0.060861
+pass_id: 18, avg_acc: 0.985840, avg_cost: 0.057095
+pass_id: 19, avg_acc: 0.988080, avg_cost: 0.052424
+pass_id: 20, avg_acc: 0.989160, avg_cost: 0.049059
+pass_id: 21, avg_acc: 0.990120, avg_cost: 0.045882
+pass_id: 22, avg_acc: 0.992080, avg_cost: 0.042140
+pass_id: 23, avg_acc: 0.992280, avg_cost: 0.039722
+pass_id: 24, avg_acc: 0.992840, avg_cost: 0.036607
+pass_id: 25, avg_acc: 0.994440, avg_cost: 0.034040
+pass_id: 26, avg_acc: 0.995000, avg_cost: 0.031501
+pass_id: 27, avg_acc: 0.995440, avg_cost: 0.028988
+pass_id: 28, avg_acc: 0.996240, avg_cost: 0.026639
+pass_id: 29, avg_acc: 0.996960, avg_cost: 0.024186
+```
+
+## Prediction
+1. Run the command `python infer.py bow_model` to start inference.
+   ```python
+   python infer.py bow_model   # bow_model specifies the directory of the model to load
+   ```
+
+## Sample Prediction Results
+```text
+model_path: bow_model/epoch0, avg_acc: 0.882800
+model_path: bow_model/epoch1, avg_acc: 0.882360
+model_path: bow_model/epoch2, avg_acc: 0.881400
+model_path: bow_model/epoch3, avg_acc: 0.877800
+model_path: bow_model/epoch4, avg_acc: 0.872920
+model_path: bow_model/epoch5, avg_acc: 0.872640
+model_path: bow_model/epoch6, avg_acc: 0.869960
+model_path: bow_model/epoch7, avg_acc: 0.865160
+model_path: bow_model/epoch8, avg_acc: 0.863680
+model_path: bow_model/epoch9, avg_acc: 0.861200
+model_path: bow_model/epoch10, avg_acc: 0.853520
+model_path: bow_model/epoch11, avg_acc: 0.850400
+model_path: bow_model/epoch12, avg_acc: 0.855960
+model_path: bow_model/epoch13, avg_acc: 0.853480
+model_path: bow_model/epoch14, avg_acc: 0.855960
+model_path: bow_model/epoch15, avg_acc: 0.854120
+model_path: bow_model/epoch16, avg_acc: 0.854160
+model_path: bow_model/epoch17, avg_acc: 0.852240
+model_path: bow_model/epoch18, avg_acc: 0.852320
+model_path: bow_model/epoch19, avg_acc: 0.850280
+model_path: bow_model/epoch20, avg_acc: 0.849760
+model_path: bow_model/epoch21, avg_acc: 0.850160
+model_path: bow_model/epoch22, avg_acc: 0.846800
+model_path: bow_model/epoch23, avg_acc: 0.845440
+model_path: bow_model/epoch24, avg_acc: 0.845640
+model_path: bow_model/epoch25, avg_acc: 0.846200
+model_path: bow_model/epoch26, avg_acc: 0.845880
+model_path: bow_model/epoch27, avg_acc: 0.844880
+model_path: bow_model/epoch28, avg_acc: 0.844680
+model_path: bow_model/epoch29, avg_acc: 0.844960
+```
+
+Note: the accuracy keeps decreasing in later epochs because of overfitting; this is expected and can be ignored.
diff --git a/text_classification/continuous_evaluation.py b/text_classification/continuous_evaluation.py
new file mode 100644
index 00000000..9d9c9240
--- /dev/null
+++ b/text_classification/continuous_evaluation.py
@@ -0,0 +1,19 @@
+"""
+continuous_evaluation.py
+"""
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+lstm_train_cost_kpi = CostKpi('lstm_train_cost', 5, 0)
+lstm_pass_duration_kpi = DurationKpi('lstm_pass_duration', 0.02, 0, actived=True)
+
+lstm_train_cost_kpi_card4 = CostKpi('lstm_train_cost_card4', 0.2, 0)
+lstm_pass_duration_kpi_card4 = DurationKpi('lstm_pass_duration_card4', 0.02, 0, actived=True)
+
+tracking_kpis = [
+    lstm_train_cost_kpi, lstm_pass_duration_kpi,
+    lstm_train_cost_kpi_card4, lstm_pass_duration_kpi_card4,
+]
diff --git a/text_classification/infer.py b/text_classification/infer.py
new file mode 100644
index 00000000..d2a0363d
--- /dev/null
+++ b/text_classification/infer.py
@@ -0,0 +1,50 @@
+import sys
+import time
+import unittest
+import contextlib
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+import utils
+
+
+def infer(test_reader, use_cuda, model_path=None):
+    """
+    inference function
+    """
+    if model_path is None:
+        print(str(model_path) + " cannot be found")
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+
+        total_acc = 0.0
+        total_count = 0
+        for data in test_reader():
+            acc = exe.run(inference_program,
+                          feed=utils.data2tensor(data, place),
+                          fetch_list=fetch_targets,
+                          return_numpy=True)
+            total_acc += acc[0] * len(data)
+            total_count += len(data)
+
+        avg_acc
= total_acc / total_count + print("model_path: %s, avg_acc: %f" % (model_path, avg_acc)) + + +if __name__ == "__main__": + word_dict, train_reader, test_reader = utils.prepare_data( + "imdb", self_dict=False, batch_size=128, buf_size=50000) + + model_path = sys.argv[1] + for i in range(30): + epoch_path = model_path + "/" + "epoch" + str(i) + infer(test_reader, use_cuda=False, model_path=epoch_path) diff --git a/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt b/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt new file mode 100644 index 00000000..bfd66206 --- /dev/null +++ b/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt @@ -0,0 +1 @@ +[17.750867716471355] \ No newline at end of file diff --git a/text_classification/latest_kpis/lstm_pass_duration_factor.txt b/text_classification/latest_kpis/lstm_pass_duration_factor.txt new file mode 100644 index 00000000..60ab6882 --- /dev/null +++ b/text_classification/latest_kpis/lstm_pass_duration_factor.txt @@ -0,0 +1 @@ +[15.24635027249654] diff --git a/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt b/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt new file mode 100644 index 00000000..f8d4e66e --- /dev/null +++ b/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt @@ -0,0 +1 @@ +[0.0030332264248281717] diff --git a/text_classification/latest_kpis/lstm_train_cost_factor.txt b/text_classification/latest_kpis/lstm_train_cost_factor.txt new file mode 100644 index 00000000..1224335d --- /dev/null +++ b/text_classification/latest_kpis/lstm_train_cost_factor.txt @@ -0,0 +1 @@ +[0.000792166159953922] diff --git a/text_classification/nets.py b/text_classification/nets.py new file mode 100644 index 00000000..cd572c72 --- /dev/null +++ b/text_classification/nets.py @@ -0,0 +1,124 @@ +import sys +import time +import numpy as np + +import paddle.fluid as fluid +import paddle + + +def bow_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + bow net + """ + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def cnn_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + win_size=3): + """ + conv net + """ + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=win_size, + act="tanh", + pool_type="max") + + fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) + + prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def lstm_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + lstm net + """ + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + 
param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + + lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + + fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') + + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def gru_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=400.0): + """ + gru net + """ + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) + gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) + gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction diff --git a/text_classification/run.xsh b/text_classification/run.xsh new file mode 100755 index 00000000..29c8faab --- /dev/null +++ b/text_classification/run.xsh @@ -0,0 +1,14 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +cudaid=${text_classification:=0} +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --model lstm + +cudaid=${text_classification_m:=0,1,2,3} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +#LSTM pass_num 15 +FLAGS_benchmark=true python train.py --model lstm --gpu_card_num 4 diff --git a/text_classification/train.py b/text_classification/train.py new file mode 100644 index 00000000..dfb3f877 --- /dev/null +++ b/text_classification/train.py @@ -0,0 +1,162 @@ +import sys +import time +import unittest +import contextlib + +import paddle.fluid as fluid +import paddle +import argparse +import utils +from nets import bow_net +from nets import cnn_net +from nets import lstm_net +from nets import gru_net +from continuous_evaluation import * +fluid.default_startup_program().random_seed = 99 + +def parse_args(): + parser = argparse.ArgumentParser("text_classification model benchmark.") + parser.add_argument( + '--model', type=str, default="lstm", help='model to run.') + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + + args = parser.parse_args() + return args + +def train(train_reader, + word_dict, + network, + use_cuda, + parallel, + save_dirname, + lr=0.2, + batch_size=128, + pass_num=30): + """ + train network + """ + args = parse_args() + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + if not parallel: + cost, acc, prediction = network(data, label, len(word_dict)) + else: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + cost, acc, prediction = network( + pd.read_input(data), pd.read_input(label), 
len(word_dict))
+
+            pd.write_output(cost)
+            pd.write_output(acc)
+
+        cost, acc = pd()
+        cost = fluid.layers.mean(cost)
+        acc = fluid.layers.mean(acc)
+
+    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
+    sgd_optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    exe.run(fluid.default_startup_program())
+    total_time = 0.0
+    newest_avg_cost = 0.0
+    for pass_id in xrange(pass_num):
+        start_time = time.time()
+        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
+        for data in train_reader():
+            avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
+                                              feed=feeder.feed(data),
+                                              fetch_list=[cost, acc])
+            data_size = len(data)
+            total_acc += data_size * avg_acc_np
+            total_cost += data_size * avg_cost_np
+            data_count += data_size
+        avg_cost = total_cost / data_count
+        newest_avg_cost = avg_cost
+        t1 = time.time()
+        total_time += t1 - start_time
+        avg_acc = total_acc / data_count
+        print("pass_id: %d, avg_acc: %f, avg_cost: %f" %
+              (pass_id, avg_acc, avg_cost))
+        if pass_id == pass_num - 1:
+            if args.gpu_card_num == 1:
+                lstm_train_cost_kpi.add_record(newest_avg_cost)
+                lstm_pass_duration_kpi.add_record(total_time / pass_num)
+            else:
+                lstm_train_cost_kpi_card4.add_record(newest_avg_cost)
+                lstm_pass_duration_kpi_card4.add_record(total_time / pass_num)
+
+        epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
+        fluid.io.save_inference_model(epoch_model, ["words", "label"], acc,
+                                      exe)
+    if args.gpu_card_num == 1:
+        lstm_train_cost_kpi.persist()
+        lstm_pass_duration_kpi.persist()
+    else:
+        lstm_train_cost_kpi_card4.persist()
+        lstm_pass_duration_kpi_card4.persist()
+
+def train_net():
+    args = parse_args()
+    word_dict, train_reader, test_reader = utils.prepare_data(
+        "imdb", self_dict=False, batch_size=128, buf_size=50000)
+
+    if args.model == "bow":
+        train(
+            train_reader,
+            word_dict,
+            bow_net,
+            use_cuda=False,
+            parallel=False,
+            save_dirname="bow_model",
+            lr=0.002,
+            pass_num=30,
+            batch_size=128)
+    elif args.model == "cnn":
+        train(
+            train_reader,
+            word_dict,
+            cnn_net,
+            use_cuda=True,
+            parallel=False,
+            save_dirname="cnn_model",
+            lr=0.01,
+            pass_num=30,
+            batch_size=4)
+    elif args.model == "lstm":
+        train(
+            train_reader,
+            word_dict,
+            lstm_net,
+            use_cuda=True,
+            parallel=True,
+            save_dirname="lstm_model",
+            lr=0.05,
+            pass_num=15,
+            batch_size=4)
+    elif args.model == "gru":
+        train(
+            train_reader,
+            word_dict,
+            gru_net,
+            use_cuda=True,
+            parallel=False,
+            save_dirname="gru_model",
+            lr=0.05,
+            pass_num=30,
+            batch_size=128)
+    else:
+        print("network name cannot be found!")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    train_net()
diff --git a/text_classification/utils.py b/text_classification/utils.py
new file mode 100644
index 00000000..bff77d11
--- /dev/null
+++ b/text_classification/utils.py
@@ -0,0 +1,99 @@
+import sys
+import time
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle
+
+
+def to_lodtensor(data, place):
+    """
+    Convert data to a LoDTensor.
+    """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def load_vocab(filename):
+    """
+    Load the imdb vocabulary.
+    """
+    vocab = {}
+    with
open(filename) as f: + wid = 0 + for line in f: + vocab[line.strip()] = wid + wid += 1 + vocab[""] = len(vocab) + return vocab + + +def data2tensor(data, place): + """ + data2tensor + """ + input_seq = to_lodtensor(map(lambda x: x[0], data), place) + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + return {"words": input_seq, "label": y_data} + + +def prepare_data(data_type="imdb", + self_dict=False, + batch_size=128, + buf_size=50000): + """ + prepare data + """ + if self_dict: + word_dict = load_vocab(data_type + ".vocab") + else: + if data_type == "imdb": + word_dict = paddle.dataset.imdb.word_dict() + elif data_type == "light_imdb": + word_dict = light_imdb.word_dict() + elif data_type == "tiny_imdb": + word_dict = tiny_imdb.word_dict() + else: + raise RuntimeError("No such dataset") + + if data_type == "imdb": + train_reader = paddle.batch( + paddle.dataset.imdb.train(word_dict), + batch_size=batch_size) + + test_reader = paddle.batch( + paddle.dataset.imdb.test(word_dict), + batch_size=batch_size) + + elif data_type == "light_imdb": + train_reader = paddle.batch( + light_imdb.train(word_dict), + batch_size=batch_size) + + test_reader = paddle.batch( + light_imdb.test(word_dict), + batch_size=batch_size) + + elif data_type == "tiny_imdb": + train_reader = paddle.batch( + tiny_imdb.train(word_dict), + batch_size=batch_size) + + test_reader = paddle.batch( + tiny_imdb.test(word_dict), + batch_size=batch_size) + else: + raise RuntimeError("no such dataset") + + return word_dict, train_reader, test_reader diff --git a/transformer/continuous_evaluation.py b/transformer/continuous_evaluation.py new file mode 100644 index 00000000..7a39755e --- /dev/null +++ b/transformer/continuous_evaluation.py @@ -0,0 +1,12 @@ +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi, DurationKpi, AccKpi + +train_avg_ppl_kpi = CostKpi('train_avg_ppl_kpi', 0.2, 0) +train_pass_duration_kpi = DurationKpi('train_pass_duration_kpi', 0.2, 0) + +tracking_kpis = [ + train_avg_ppl_kpi, + train_pass_duration_kpi, +] diff --git a/transformer/infer.py b/transformer/infer.py new file mode 100644 index 00000000..7d0c9776 --- /dev/null +++ b/transformer/infer.py @@ -0,0 +1,354 @@ +import numpy as np + +import paddle +import paddle.fluid as fluid + +import model +from model import wrap_encoder as encoder +from model import wrap_decoder as decoder +from transformer_config import * +from train import pad_batch_data + + +def translate_batch(exe, + src_words, + encoder, + enc_in_names, + enc_out_names, + decoder, + dec_in_names, + dec_out_names, + beam_size, + max_length, + n_best, + batch_size, + n_head, + d_model, + src_pad_idx, + trg_pad_idx, + bos_idx, + eos_idx, + unk_idx, + output_unk=True): + """ + Run the encoder program once and run the decoder program multiple times to + implement beam search externally. + """ + # Prepare data for encoder and run the encoder. + enc_in_data = pad_batch_data( + src_words, + src_pad_idx, + n_head, + is_target=False, + is_label=False, + return_attn_bias=True, + return_max_len=False) + # Append the data shape input to reshape the output of embedding layer. + enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1], d_model], dtype="int32") + ] + # Append the shape inputs to reshape before and after softmax in encoder + # self attention. 
+ enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array( + enc_in_data[2].shape, dtype="int32") + ] + enc_output = exe.run(encoder, + feed=dict(zip(enc_in_names, enc_in_data)), + fetch_list=enc_out_names)[0] + + # Beam Search. + # To store the beam info. + scores = np.zeros((batch_size, beam_size), dtype="float32") + prev_branchs = [[] for i in range(batch_size)] + next_ids = [[] for i in range(batch_size)] + # Use beam_inst_map to map beam idx to the instance idx in batch, since the + # size of feeded batch is changing. + beam_inst_map = { + beam_idx: inst_idx + for inst_idx, beam_idx in enumerate(range(batch_size)) + } + # Use active_beams to recode the alive. + active_beams = range(batch_size) + + def beam_backtrace(prev_branchs, next_ids, n_best=beam_size): + """ + Decode and select n_best sequences for one instance by backtrace. + """ + seqs = [] + for i in range(n_best): + k = i + seq = [] + for j in range(len(prev_branchs) - 1, -1, -1): + seq.append(next_ids[j][k]) + k = prev_branchs[j][k] + seq = seq[::-1] + # Add the , since next_ids don't include the . + seq = [bos_idx] + seq + seqs.append(seq) + return seqs + + def init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output): + """ + Initialize the input data for decoder. + """ + trg_words = np.array( + [[bos_idx]] * batch_size * beam_size, dtype="int64") + trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64") + src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[ + -1], enc_in_data[2], 1 + # This is used to remove attention on subsequent words. + trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len, + trg_max_len)) + trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape( + [-1, 1, trg_max_len, trg_max_len]) + trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) * + [-1e9]).astype("float32") + # This is used to remove attention on the paddings of source sequences. + trg_src_attn_bias = np.tile( + src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis], + [1, beam_size, 1, trg_max_len, 1]).reshape([ + -1, src_slf_attn_bias.shape[1], trg_max_len, + src_slf_attn_bias.shape[-1] + ]) + # Append the shape input to reshape the output of embedding layer. + trg_data_shape = np.array( + [batch_size * beam_size, trg_max_len, d_model], dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") + enc_output = np.tile( + enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape( + [-1, enc_output.shape[-2], enc_output.shape[-1]]) + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output + + def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map): + """ + Update the input data of decoder mainly by slicing from the previous + input data and dropping the finished instance beams. 
+ """ + trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output = dec_in_data + trg_cur_len = trg_slf_attn_bias.shape[-1] + 1 + trg_words = np.array( + [ + beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx]) + for beam_idx in active_beams + ], + dtype="int64") + trg_words = trg_words.reshape([-1, 1]) + trg_pos = np.array( + [range(1, trg_cur_len + 1)] * len(active_beams) * beam_size, + dtype="int64").reshape([-1, 1]) + active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams] + active_beams_indice = ( + (np.array(active_beams) * beam_size)[:, np.newaxis] + + np.array(range(beam_size))[np.newaxis, :]).flatten() + # This is used to remove attention on subsequent words. + trg_slf_attn_bias = np.ones((len(active_beams) * beam_size, + trg_cur_len, trg_cur_len)) + trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape( + [-1, 1, trg_cur_len, trg_cur_len]) + trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) * + [-1e9]).astype("float32") + # This is used to remove attention on the paddings of source sequences. + trg_src_attn_bias = np.tile(trg_src_attn_bias[ + active_beams_indice, :, ::trg_src_attn_bias.shape[2], :], + [1, 1, trg_cur_len, 1]) + # Append the shape input to reshape the output of embedding layer. + trg_data_shape = np.array( + [len(active_beams) * beam_size, trg_cur_len, d_model], + dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") + enc_output = enc_output[active_beams_indice, :, :] + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output + + dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data, + enc_output) + for i in range(max_length): + predict_all = exe.run(decoder, + feed=dict(zip(dec_in_names, dec_in_data)), + fetch_list=dec_out_names)[0] + predict_all = np.log( + predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1]) + [:, -1, :]) + predict_all = (predict_all + scores[active_beams].reshape( + [len(beam_inst_map) * beam_size, -1])).reshape( + [len(beam_inst_map), beam_size, -1]) + if not output_unk: # To exclude the token. 
+ predict_all[:, :, unk_idx] = -1e9 + active_beams = [] + for beam_idx in range(batch_size): + if not beam_inst_map.has_key(beam_idx): + continue + inst_idx = beam_inst_map[beam_idx] + predict = (predict_all[inst_idx, :, :] + if i != 0 else predict_all[inst_idx, 0, :]).flatten() + top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:] + top_scores_ids = top_k_indice[np.argsort(predict[top_k_indice]) + [::-1]] + top_scores = predict[top_scores_ids] + scores[beam_idx] = top_scores + prev_branchs[beam_idx].append(top_scores_ids / + predict_all.shape[-1]) + next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1]) + if next_ids[beam_idx][-1][0] != eos_idx: + active_beams.append(beam_idx) + if len(active_beams) == 0: + break + dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams, + beam_inst_map) + beam_inst_map = { + beam_idx: inst_idx + for inst_idx, beam_idx in enumerate(active_beams) + } + + # Decode beams and select n_best sequences for each instance by backtrace. + seqs = [ + beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best) + for beam_idx in range(batch_size) + ] + + return seqs, scores[:, :n_best].tolist() + + +def main(): + place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + encoder_program = fluid.Program() + with fluid.program_guard(main_program=encoder_program): + enc_output = encoder( + ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout) + + decoder_program = fluid.Program() + with fluid.program_guard(main_program=decoder_program): + predict = decoder( + ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout) + + # Load model parameters of encoder and decoder separately from the saved + # transformer model. + encoder_var_names = [] + for op in encoder_program.block(0).ops: + encoder_var_names += op.input_arg_names + encoder_param_names = filter( + lambda var_name: isinstance(encoder_program.block(0).var(var_name), + fluid.framework.Parameter), + encoder_var_names) + encoder_params = map(encoder_program.block(0).var, encoder_param_names) + decoder_var_names = [] + for op in decoder_program.block(0).ops: + decoder_var_names += op.input_arg_names + decoder_param_names = filter( + lambda var_name: isinstance(decoder_program.block(0).var(var_name), + fluid.framework.Parameter), + decoder_var_names) + decoder_params = map(decoder_program.block(0).var, decoder_param_names) + fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=encoder_params) + fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params) + + # This is used here to set dropout to the test mode. 
+ encoder_program = fluid.io.get_inference_program( + target_vars=[enc_output], main_program=encoder_program) + decoder_program = fluid.io.get_inference_program( + target_vars=[predict], main_program=decoder_program) + + test_data = paddle.batch( + paddle.dataset.wmt16.test(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=InferTaskConfig.batch_size) + + trg_idx2word = paddle.dataset.wmt16.get_dict( + "de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True) + + def post_process_seq(seq, + bos_idx=ModelHyperParams.bos_idx, + eos_idx=ModelHyperParams.eos_idx, + output_bos=InferTaskConfig.output_bos, + output_eos=InferTaskConfig.output_eos): + """ + Post-process the beam-search decoded sequence. Truncate from the first + and remove the and tokens currently. + """ + eos_pos = len(seq) - 1 + for i, idx in enumerate(seq): + if idx == eos_idx: + eos_pos = i + break + seq = seq[:eos_pos + 1] + return filter( + lambda idx: (output_bos or idx != bos_idx) and \ + (output_eos or idx != eos_idx), + seq) + + for batch_id, data in enumerate(test_data()): + batch_seqs, batch_scores = translate_batch( + exe, + [item[0] for item in data], + encoder_program, + encoder_data_input_fields + encoder_util_input_fields, + [enc_output.name], + decoder_program, + decoder_data_input_fields[:-1] + decoder_util_input_fields + + (decoder_data_input_fields[-1], ), + [predict.name], + InferTaskConfig.beam_size, + InferTaskConfig.max_length, + InferTaskConfig.n_best, + len(data), + ModelHyperParams.n_head, + ModelHyperParams.d_model, + ModelHyperParams.eos_idx, # Use eos_idx to pad. + ModelHyperParams.eos_idx, # Use eos_idx to pad. + ModelHyperParams.bos_idx, + ModelHyperParams.eos_idx, + ModelHyperParams.unk_idx, + output_unk=InferTaskConfig.output_unk) + for i in range(len(batch_seqs)): + # Post-process the beam-search decoded sequences. + seqs = map(post_process_seq, batch_seqs[i]) + scores = batch_scores[i] + for seq in seqs: + print(" ".join([trg_idx2word[idx] for idx in seq])) + + +if __name__ == "__main__": + main() diff --git a/transformer/latest_kpis/train_avg_ppl_kpi_factor.txt b/transformer/latest_kpis/train_avg_ppl_kpi_factor.txt new file mode 100644 index 00000000..4075807a --- /dev/null +++ b/transformer/latest_kpis/train_avg_ppl_kpi_factor.txt @@ -0,0 +1 @@ +[19.267375946044922] diff --git a/transformer/latest_kpis/train_pass_duration_kpi_factor.txt b/transformer/latest_kpis/train_pass_duration_kpi_factor.txt new file mode 100644 index 00000000..3cf0a471 --- /dev/null +++ b/transformer/latest_kpis/train_pass_duration_kpi_factor.txt @@ -0,0 +1 @@ +[56.41797208786011] diff --git a/transformer/model.py b/transformer/model.py new file mode 100644 index 00000000..f2ffb88e --- /dev/null +++ b/transformer/model.py @@ -0,0 +1,578 @@ +from functools import partial +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as layers + +from transformer_config import * + + +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. 
+ """ + position_enc = np.array([[ + pos / np.power(10000, 2 * (j // 2) / d_pos_vec) + for j in range(d_pos_vec) + ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + return position_enc.astype("float32") + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. + """ + q = layers.fc(input=queries, + size=d_key * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_key, + fan_out=n_head * d_key), + bias_attr=False, + num_flatten_dims=2) + k = layers.fc(input=keys, + size=d_key * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_key, + fan_out=n_head * d_key), + bias_attr=False, + num_flatten_dims=2) + v = layers.fc(input=values, + size=d_value * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_value, + fan_out=n_head * d_value), + bias_attr=False, + num_flatten_dims=2) + return q, k, v + + def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + if n_head == 1: + return x + + hidden_size = x.shape[-1] + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped = layers.reshape( + x=x, shape=[0, -1, n_head, hidden_size // n_head]) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. 
+ return layers.reshape( + x=trans_x, + shape=map(int, [0, -1, trans_x.shape[2] * trans_x.shape[3]])) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate): + """ + Scaled Dot-Product Attention + """ + scaled_q = layers.scale(x=q, scale=d_model**-0.5) + product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + weights = layers.reshape( + x=layers.elementwise_add( + x=product, y=attn_bias) if attn_bias else product, + shape=[-1, product.shape[-1]], + actual_shape=pre_softmax_shape, + act="softmax") + weights = layers.reshape( + x=weights, shape=product.shape, actual_shape=post_softmax_shape) + if dropout_rate: + weights = layers.dropout( + weights, dropout_prob=dropout_rate, is_test=False) + out = layers.matmul(weights, v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + param_attr=fluid.initializer.Xavier(uniform=False), + bias_attr=False, + num_flatten_dims=2) + return proj_out + + +def positionwise_feed_forward(x, d_inner_hid, d_hid): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=2, + param_attr=fluid.initializer.Uniform( + low=-(d_hid**-0.5), high=(d_hid**-0.5)), + act="relu") + out = layers.fc(input=hidden, + size=d_hid, + num_flatten_dims=2, + param_attr=fluid.initializer.Uniform( + low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5))) + return out + + +def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. + """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.initializer.Constant(1.), + bias_attr=fluid.initializer.Constant(0.)) + elif cmd == "d": # add dropout + if dropout_rate: + out = layers.dropout( + out, dropout_prob=dropout_rate, is_test=False) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def prepare_encoder(src_word, + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + src_data_shape=None, + pos_enc_param_name=None): + """Add word embeddings and position encodings. + The output tensor has a shape of: + [batch_size, max_src_length_in_batch, d_model]. + This module is used at the bottom of the encoder stacks. 
+ """ + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + param_attr=fluid.initializer.Normal(0., 1.)) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, trainable=False)) + enc_input = src_word_emb + src_pos_enc + enc_input = layers.reshape( + x=enc_input, + shape=[-1, src_max_len, src_emb_dim], + actual_shape=src_data_shape) + return layers.dropout( + enc_input, dropout_prob=dropout_rate, + is_test=False) if dropout_rate else enc_input + + +prepare_encoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[0]) +prepare_decoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[1]) + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. + """ + attn_output = multi_head_attention( + enc_input, enc_input, enc_input, attn_bias, d_key, d_value, d_model, + n_head, dropout_rate, pre_softmax_shape, post_softmax_shape) + attn_output = post_process_layer(enc_input, attn_output, "dan", + dropout_rate) + ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) + return post_process_layer(attn_output, ffd_output, "dan", dropout_rate) + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + for i in range(n_layer): + enc_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + pre_softmax_shape, + post_softmax_shape, ) + enc_input = enc_output + return enc_output + + +def decoder_layer(dec_input, + enc_output, + slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): + """ The layer to be stacked in decoder part. + The structure of this module is similar to that in the encoder part except + a multi-head attention is added to implement encoder-decoder attention. 
+ """ + slf_attn_output = multi_head_attention( + dec_input, + dec_input, + dec_input, + slf_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) + slf_attn_output = post_process_layer( + dec_input, + slf_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + enc_attn_output = multi_head_attention( + slf_attn_output, + enc_output, + enc_output, + dec_enc_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + enc_attn_output = post_process_layer( + slf_attn_output, + enc_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + ffd_output = positionwise_feed_forward( + enc_attn_output, + d_inner_hid, + d_model, ) + dec_output = post_process_layer( + enc_attn_output, + ffd_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + return dec_output + + +def decoder(dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): + """ + The decoder is composed of a stack of identical decoder_layer layers. + """ + for i in range(n_layer): + dec_output = decoder_layer( + dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + dec_input = dec_output + return dec_output + + +def make_all_inputs(input_fields): + """ + Define the input data layers for the transformer model. + """ + inputs = [] + for input_field in input_fields: + input_var = layers.data( + name=input_field, + shape=input_descs[input_field][0], + dtype=input_descs[input_field][1], + append_batch_size=False) + inputs.append(input_var) + fluid.default_startup_program().global_block().clone_variable( + input_var) + return inputs + + +def transformer( + src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + label_smooth_eps, ): + enc_inputs = make_all_inputs(encoder_data_input_fields + + encoder_util_input_fields) + + enc_output = wrap_encoder( + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + enc_inputs, ) + + dec_inputs = make_all_inputs(decoder_data_input_fields[:-1] + + decoder_util_input_fields) + + predict = wrap_decoder( + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + dec_inputs, + enc_output, ) + + # Padding index do not contribute to the total loss. The weights is used to + # cancel padding index in calculating the loss. 
+ label, weights = make_all_inputs(label_data_input_fields) + if label_smooth_eps: + label = layers.label_smooth( + label=layers.one_hot( + input=label, depth=trg_vocab_size), + epsilon=label_smooth_eps) + cost = layers.softmax_with_cross_entropy( + logits=predict, + label=label, + soft_label=True if label_smooth_eps else False) + # cost = layers.softmax_with_cross_entropy(logits=predict, label=gold) + weighted_cost = cost * weights + sum_cost = layers.reduce_sum(weighted_cost) + token_num = layers.reduce_sum(weights) + avg_cost = sum_cost / token_num + return sum_cost, avg_cost, predict, token_num + + +def wrap_encoder(src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + enc_inputs=None): + """ + The wrapper assembles together all needed layers for the encoder. + """ + if enc_inputs is None: + # This is used to implement independent encoder program in inference. + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + make_all_inputs(encoder_data_input_fields + + encoder_util_input_fields) + else: + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + enc_inputs + enc_input = prepare_encoder( + src_word, + src_pos, + src_vocab_size, + d_model, + max_length, + dropout_rate, + src_data_shape, ) + enc_output = encoder( + enc_input, + src_slf_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) + return enc_output + + +def wrap_decoder(trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + dec_inputs=None, + enc_output=None): + """ + The wrapper assembles together all needed layers for the decoder. + """ + if dec_inputs is None: + # This is used to implement independent decoder program in inference. + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + enc_output, trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape = make_all_inputs( + decoder_data_input_fields + decoder_util_input_fields) + else: + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape = dec_inputs + + dec_input = prepare_decoder( + trg_word, + trg_pos, + trg_vocab_size, + d_model, + max_length, + dropout_rate, + trg_data_shape, ) + dec_output = decoder( + dec_input, + enc_output, + trg_slf_attn_bias, + trg_src_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + # Return logits for training and probs for inference. 
+    predict = layers.reshape(
+        x=layers.fc(input=dec_output,
+                    size=trg_vocab_size,
+                    bias_attr=False,
+                    num_flatten_dims=2),
+        shape=[-1, trg_vocab_size],
+        act="softmax" if dec_inputs is None else None)
+    return predict
diff --git a/transformer/optim.py b/transformer/optim.py
new file mode 100644
index 00000000..56b5af3b
--- /dev/null
+++ b/transformer/optim.py
@@ -0,0 +1,37 @@
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+
+class LearningRateScheduler(object):
+    """
+    Wrapper for learning rate scheduling as described in the Transformer
+    paper. LearningRateScheduler adapts the learning rate externally and the
+    adapted learning rate will be fed into the main_program as input data.
+    """
+
+    def __init__(self,
+                 d_model,
+                 warmup_steps,
+                 learning_rate=0.001,
+                 current_steps=0,
+                 name="learning_rate"):
+        self.current_steps = current_steps
+        self.warmup_steps = warmup_steps
+        self.d_model = d_model
+        self.static_lr = learning_rate
+        self.learning_rate = layers.create_global_var(
+            name=name,
+            shape=[1],
+            value=float(learning_rate),
+            dtype="float32",
+            persistable=True)
+
+    def update_learning_rate(self):
+        # lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
+        self.current_steps += 1
+        lr_value = np.power(self.d_model, -0.5) * np.min([
+            np.power(self.current_steps, -0.5),
+            np.power(self.warmup_steps, -1.5) * self.current_steps
+        ])
+        return np.array([lr_value], dtype="float32")
diff --git a/transformer/run.xsh b/transformer/run.xsh
new file mode 100755
index 00000000..2f6f1ffd
--- /dev/null
+++ b/transformer/run.xsh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+cudaid=${transformer_cudaid:=0}  # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py
diff --git a/transformer/train.py b/transformer/train.py
new file mode 100644
index 00000000..f1b3bfe1
--- /dev/null
+++ b/transformer/train.py
@@ -0,0 +1,279 @@
+import os
+import time
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+
+from model import transformer, position_encoding_init
+from optim import LearningRateScheduler
+from transformer_config import *
+from continuous_evaluation import train_avg_ppl_kpi, train_pass_duration_kpi
+
+
+def pad_batch_data(insts,
+                   pad_idx,
+                   n_head,
+                   is_target=False,
+                   is_label=False,
+                   return_attn_bias=True,
+                   return_max_len=True,
+                   return_num_token=False):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias.
+    """
+    return_list = []
+    max_len = max(len(inst) for inst in insts)
+    num_token = reduce(
+        lambda x, y: x + y,
+        [len(inst) for inst in insts]) if return_num_token else 0
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and have no effect on parameter gradients.
+    inst_data = np.array(
+        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+    return_list += [inst_data.astype("int64").reshape([-1, 1])]
+    if is_label:  # label weight
+        inst_weight = np.array([[1.] * len(inst) + [0.] * (max_len - len(inst))
+                                for inst in insts])
+        return_list += [inst_weight.astype("float32").reshape([-1, 1])]
+    else:  # position data
+        inst_pos = np.array([
+            range(1, len(inst) + 1) + [0] * (max_len - len(inst))
+            for inst in insts
+        ])
+        return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+    if return_attn_bias:
+        if is_target:
+            # This is used to avoid attention on paddings and subsequent
+            # words.
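+            # For example, with max_len = 3 each head gets the bias matrix
+            #     [[0, -1e9, -1e9],
+            #      [0,    0, -1e9],
+            #      [0,    0,    0]]
+            # np.triu(..., 1) keeps the strictly upper triangle (the future
+            # positions of every time step) and the -1e9 bias drives their
+            # post-softmax weights to zero.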
+ slf_attn_bias_data = np.ones( + (inst_data.shape[0], max_len, max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( + [-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + if return_num_token: + return_list += [num_token] + return return_list if len(return_list) > 1 else return_list[0] + + +def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx, + trg_pad_idx, n_head, d_model): + """ + Put all padded data needed by training into a dict. + """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + + # These shape tensors are used in reshape_op. + src_data_shape = np.array([-1, src_max_len, d_model], dtype="int32") + trg_data_shape = np.array([-1, trg_max_len, d_model], dtype="int32") + src_slf_attn_pre_softmax_shape = np.array( + [-1, src_slf_attn_bias.shape[-1]], dtype="int32") + src_slf_attn_post_softmax_shape = np.array( + [-1] + list(src_slf_attn_bias.shape[1:]), dtype="int32") + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + [-1] + list(trg_slf_attn_bias.shape[1:]), dtype="int32") + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + [-1] + list(trg_src_attn_bias.shape[1:]), dtype="int32") + + lbl_word, lbl_weight, num_token = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False, + return_num_token=True) + + data_input_dict = dict( + zip(data_input_names, [ + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ])) + util_input_dict = dict( + zip(util_input_names, [ + src_data_shape, src_slf_attn_pre_softmax_shape, + src_slf_attn_post_softmax_shape, trg_data_shape, + trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, + trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape + ])) + return data_input_dict, util_input_dict, np.asarray( + [num_token], dtype="float32") + + +def read_multiple(reader, count): + def __impl__(): + res = [] + for item in reader(): + res.append(item) + if len(res) == count: + yield res + res = [] + + if len(res) == count: + yield res + + return __impl__ + + +def main(): + place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + 
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, + TrainTaskConfig.label_smooth_eps) + + lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate) + optimizer = fluid.optimizer.Adam( + learning_rate=lr_scheduler.learning_rate, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + + dev_count = fluid.core.get_cuda_device_count() + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt16.train(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + buf_size=100000), + batch_size=TrainTaskConfig.batch_size) + + # Program to do validation. + test_program = fluid.default_main_program().clone() + with fluid.program_guard(test_program): + test_program = fluid.io.get_inference_program([avg_cost]) + val_data = paddle.batch( + paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=TrainTaskConfig.batch_size) + + def test(exe): + test_total_cost = 0 + test_total_token = 0 + test_data = read_multiple(reader=val_data, count=dev_count) + for batch_id, data in enumerate(test_data()): + feed_list = [] + for place_id, data_buffer in enumerate(data): + data_input_dict, util_input_dict, _ = prepare_batch_input( + data_buffer, data_input_names, util_input_names, + ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, + ModelHyperParams.n_head, ModelHyperParams.d_model) + feed_list.append( + dict(data_input_dict.items() + util_input_dict.items())) + + outs = exe.run(feed=feed_list, + fetch_list=[sum_cost.name, token_num.name]) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + test_total_cost += sum_cost_val.sum() + test_total_token += token_num_val.sum() + test_avg_cost = test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl + + # Initialize the parameters. 
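+    # Resume from the checkpoint when TrainTaskConfig.ckpt_path is set;
+    # otherwise run the startup program for a fresh parameter initialization.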
+ if TrainTaskConfig.ckpt_path: + fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path) + lr_scheduler.current_steps = TrainTaskConfig.start_step + else: + exe.run(fluid.framework.default_startup_program()) + + data_input_names = encoder_data_input_fields + decoder_data_input_fields[: + -1] + label_data_input_fields + util_input_names = encoder_util_input_fields + decoder_util_input_fields + + train_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + loss_name=sum_cost.name) + + test_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=test_program, + share_vars_from=train_exe) + + init = False + train_data = read_multiple(reader=train_data, count=dev_count) + + for pass_id in xrange(TrainTaskConfig.pass_num): + pass_start_time = time.time() + for batch_id, data in enumerate(train_data()): + feed_list = [] + total_num_token = 0 + lr_rate = lr_scheduler.update_learning_rate() + for place_id, data_buffer in enumerate(data): + data_input_dict, util_input_dict, num_token = prepare_batch_input( + data_buffer, data_input_names, util_input_names, + ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, + ModelHyperParams.n_head, ModelHyperParams.d_model) + total_num_token += num_token + feed_list.append( + dict(data_input_dict.items() + util_input_dict.items() + + {lr_scheduler.learning_rate.name: lr_rate}.items())) + + if not init: + for pos_enc_param_name in pos_enc_param_names: + tensor = position_encoding_init( + ModelHyperParams.max_length + 1, + ModelHyperParams.d_model) + feed_list[place_id][pos_enc_param_name] = tensor + for feed_dict in feed_list: + feed_dict[ + sum_cost.name + + "@GRAD"] = 1. / total_num_token if TrainTaskConfig.use_avg_cost else np.asarray( + [1.], dtype="float32") + outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name], + feed=feed_list) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + total_sum_cost = sum_cost_val.sum( + ) # sum the cost from multi devices + total_token_num = token_num_val.sum() + total_avg_cost = total_sum_cost / total_token_num + print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % + (pass_id, batch_id, total_sum_cost, total_avg_cost, + np.exp([min(total_avg_cost, 100)]))) + init = True + # Validate and save the model for inference. + val_avg_cost, val_ppl = test(test_exe) + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + print("pass_id = " + str(pass_id) + " time_consumed = " + str( + time_consumed)) + if pass_id == TrainTaskConfig.pass_num - 1: + train_avg_ppl_kpi.add_record(np.array(val_ppl, dtype='float32')) + train_pass_duration_kpi.add_record(time_consumed) + train_avg_ppl_kpi.persist() + train_pass_duration_kpi.persist() + + +if __name__ == "__main__": + main() diff --git a/transformer/transformer_config.py b/transformer/transformer_config.py new file mode 100644 index 00000000..d37636d9 --- /dev/null +++ b/transformer/transformer_config.py @@ -0,0 +1,184 @@ +class TrainTaskConfig(object): + use_gpu = True + # the epoch number to train. + pass_num = 5 + # the number of sequences contained in a mini-batch. + batch_size = 64 + # the hyper parameters for Adam optimizer. + learning_rate = 0.001 + beta1 = 0.9 + beta2 = 0.98 + eps = 1e-9 + # the parameters for learning rate scheduling. + warmup_steps = 4000 + # the flag indicating to use average loss or sum loss when training. 
+    use_avg_cost = True
+    # the weight used to mix up the ground-truth distribution and the fixed
+    # uniform distribution in label smoothing when training.
+    # Set this as zero if label smoothing is not wanted.
+    label_smooth_eps = 0.1
+    # the directory for saving trained models.
+    model_dir = "trained_models"
+    # the directory for saving checkpoints.
+    ckpt_dir = "trained_ckpts"
+    # the directory for loading a checkpoint.
+    # If provided, continue training from the checkpoint.
+    ckpt_path = None
+    # the parameter to initialize the learning rate scheduler.
+    # It should be provided if checkpoints are used, since the checkpoint
+    # doesn't include the training step counter currently.
+    start_step = 0
+
+
+class InferTaskConfig(object):
+    use_gpu = True
+    # the number of examples in one run for sequence generation.
+    batch_size = 10
+    # the parameters for beam search.
+    beam_size = 5
+    max_length = 30
+    # the number of decoded sentences to output.
+    n_best = 1
+    # the flags indicating whether to output the special tokens.
+    output_bos = False
+    output_eos = False
+    output_unk = False
+    # the directory for loading the trained model.
+    model_path = 'trained_models/pass_10.infer.model'
+
+
+class ModelHyperParams(object):
+    # This model directly uses paddle.dataset.wmt16 in which the <bos>, <eos>
+    # and <unk> tokens have already been added. As for the <pad> token, any
+    # token included in dict can be used to pad, since the paddings' loss will
+    # be masked out and have no effect on parameter gradients.
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # size of target word dictionary.
+    trg_vocab_size = 10000
+    # index for <bos> token
+    bos_idx = 0
+    # index for <eos> token
+    eos_idx = 1
+    # index for <unk> token
+    unk_idx = 2
+    # max length of sequences.
+    # The size of the position encoding table should be at least max_length
+    # plus 1, since the sinusoid position encoding starts from 1 and 0 can be
+    # used as the padding token for position encoding.
+    max_length = 50
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of heads used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+# Here list the data shapes and data types of all inputs.
+# The shapes here act as placeholders and are set to pass the infer-shape at
+# compile time.
+input_descs = {
+    # The actual data shape of src_word is:
+    # [batch_size * max_src_len_in_batch, 1]
+    "src_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    # The actual data shape of src_pos is:
+    # [batch_size * max_src_len_in_batch, 1]
+    "src_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    # This input is used to remove attention weights on paddings in the
+    # encoder.
+    # The actual data shape of src_slf_attn_bias is:
+    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
+    "src_slf_attn_bias":
+    [(1, ModelHyperParams.n_head, (ModelHyperParams.max_length + 1),
+      (ModelHyperParams.max_length + 1)), "float32"],
+    # This shape input is used to reshape the output of the embedding layer.
+ "src_data_shape": [(3L, ), "int32"], + # This shape input is used to reshape before softmax in self attention. + "src_slf_attn_pre_softmax_shape": [(2L, ), "int32"], + # This shape input is used to reshape after softmax in self attention. + "src_slf_attn_post_softmax_shape": [(4L, ), "int32"], + # The actual data shape of trg_word is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"], + # The actual data shape of trg_pos is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"], + # This input is used to remove attention weights on paddings and + # subsequent words in the decoder. + # The actual data shape of trg_slf_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch] + "trg_slf_attn_bias": [(1, ModelHyperParams.n_head, + (ModelHyperParams.max_length + 1), + (ModelHyperParams.max_length + 1)), "float32"], + # This input is used to remove attention weights on paddings of the source + # input in the encoder-decoder attention. + # The actual data shape of trg_src_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch] + "trg_src_attn_bias": [(1, ModelHyperParams.n_head, + (ModelHyperParams.max_length + 1), + (ModelHyperParams.max_length + 1)), "float32"], + # This shape input is used to reshape the output of embedding layer. + "trg_data_shape": [(3L, ), "int32"], + # This shape input is used to reshape before softmax in self attention. + "trg_slf_attn_pre_softmax_shape": [(2L, ), "int32"], + # This shape input is used to reshape after softmax in self attention. + "trg_slf_attn_post_softmax_shape": [(4L, ), "int32"], + # This shape input is used to reshape before softmax in encoder-decoder + # attention. + "trg_src_attn_pre_softmax_shape": [(2L, ), "int32"], + # This shape input is used to reshape after softmax in encoder-decoder + # attention. + "trg_src_attn_post_softmax_shape": [(4L, ), "int32"], + # This input is used in independent decoder program for inference. + # The actual data shape of enc_output is: + # [batch_size, max_src_len_in_batch, d_model] + "enc_output": [(1, (ModelHyperParams.max_length + 1), + ModelHyperParams.d_model), "float32"], + # The actual data shape of label_word is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"], + # This input is used to mask out the loss of paddding tokens. + # The actual data shape of label_weight is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_weight": [(1 * (ModelHyperParams.max_length + 1), 1L), "float32"], +} + +# Names of position encoding table which will be initialized externally. +pos_enc_param_names = ( + "src_pos_enc_table", + "trg_pos_enc_table", ) +# separated inputs for different usages. 
+encoder_data_input_fields = ( + "src_word", + "src_pos", + "src_slf_attn_bias", ) +encoder_util_input_fields = ( + "src_data_shape", + "src_slf_attn_pre_softmax_shape", + "src_slf_attn_post_softmax_shape", ) +decoder_data_input_fields = ( + "trg_word", + "trg_pos", + "trg_slf_attn_bias", + "trg_src_attn_bias", + "enc_output", ) +decoder_util_input_fields = ( + "trg_data_shape", + "trg_slf_attn_pre_softmax_shape", + "trg_slf_attn_post_softmax_shape", + "trg_src_attn_pre_softmax_shape", + "trg_src_attn_post_softmax_shape", ) +label_data_input_fields = ( + "lbl_word", + "lbl_weight", ) diff --git a/vgg16/continuous_evaluation.py b/vgg16/continuous_evaluation.py new file mode 100644 index 00000000..24b09ddd --- /dev/null +++ b/vgg16/continuous_evaluation.py @@ -0,0 +1,22 @@ +""" +continuous_evaluation.py +""" +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import AccKpi +from kpi import CostKpi +from kpi import DurationKpi + +cifar10_128_train_speed_kpi = AccKpi('cifar10_128_train_speed', 0.02, 0, actived=True) +cifar10_128_gpu_memory_kpi = DurationKpi('cifar10_128_gpu_memory', 0.1, 0, actived=True) + +flowers_32_train_speed_kpi = AccKpi('flowers_32_train_speed', 0.02, 0, actived=True) +flowers_32_gpu_memory_kpi = DurationKpi('flowers_32_gpu_memory', 0.1, 0, actived=True) + +tracking_kpis = [ + cifar10_128_train_speed_kpi, + cifar10_128_gpu_memory_kpi, + flowers_32_train_speed_kpi, + flowers_32_gpu_memory_kpi, +] diff --git a/vgg16/get_gpu_data.py b/vgg16/get_gpu_data.py new file mode 100644 index 00000000..1e391253 --- /dev/null +++ b/vgg16/get_gpu_data.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +######################################################################## +# +# Copyright (c) 2018 Baidu.com, Inc. 
All Rights Reserved
+#
+########################################################################
+"""
+File: get_gpu_data.py
+Author: paddle(paddle@baidu.com)
+Date: 2018/04/02 15:57:14
+"""
+import argparse
+from continuous_evaluation import tracking_kpis

+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+args = parser.parse_args()
+
+
+def save_gpu_data():
+    mem_list = []
+    with open('memory.txt', 'r') as f:
+        for i, data in enumerate(f.readlines()):
+            if i == 0:
+                continue
+            mem_list.append(int(data.split("\n")[0].split(" ")[0]))
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size):
+            gpu_memory_kpi = kpi
+    gpu_memory_kpi.add_record(max(mem_list))
+    gpu_memory_kpi.persist()
+
+
+if __name__ == "__main__":
+    save_gpu_data()
diff --git a/vgg16/latest_kpis/cifar10_128_gpu_memory_factor.txt b/vgg16/latest_kpis/cifar10_128_gpu_memory_factor.txt
new file mode 100644
index 00000000..e6827254
--- /dev/null
+++ b/vgg16/latest_kpis/cifar10_128_gpu_memory_factor.txt
@@ -0,0 +1 @@
+[2198]
diff --git a/vgg16/latest_kpis/cifar10_128_train_speed_factor.txt b/vgg16/latest_kpis/cifar10_128_train_speed_factor.txt
new file mode 100644
index 00000000..49da8670
--- /dev/null
+++ b/vgg16/latest_kpis/cifar10_128_train_speed_factor.txt
@@ -0,0 +1 @@
+[735.5991821289062]
diff --git a/vgg16/latest_kpis/flowers_32_gpu_memory_factor.txt b/vgg16/latest_kpis/flowers_32_gpu_memory_factor.txt
new file mode 100644
index 00000000..590bed9c
--- /dev/null
+++ b/vgg16/latest_kpis/flowers_32_gpu_memory_factor.txt
@@ -0,0 +1 @@
+[8938]
diff --git a/vgg16/latest_kpis/flowers_32_train_speed_factor.txt b/vgg16/latest_kpis/flowers_32_train_speed_factor.txt
new file mode 100644
index 00000000..b767a1cd
--- /dev/null
+++ b/vgg16/latest_kpis/flowers_32_train_speed_factor.txt
@@ -0,0 +1 @@
+[51.00917434692383]
diff --git a/vgg16/model.py b/vgg16/model.py
new file mode 100644
index 00000000..fa5c25d5
--- /dev/null
+++ b/vgg16/model.py
@@ -0,0 +1,289 @@
+"""
+VGG16 benchmark in Fluid
+"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import commands
+import subprocess
+import threading
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+
+from continuous_evaluation import tracking_kpis
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    "--gpu_id",
+    type=int,
+    default=3,
+    help="The GPU Card Id.
(default: %(default)d)") +parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') +args = parser.parse_args() + + +def vgg16_bn_drop(input): + """ + vgg16_bn_drop + """ + + def conv_block(input, num_filter, groups, dropouts): + """ + conv_block + """ + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def main(): + """ + main + """ + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + opts = optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # Initialize executor + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + # test + def test(exe): + """ + test + """ + test_accuracy = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype( + "float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") 
+            y_data = y_data.reshape([-1, 1])
+
+            acc, weight = exe.run(inference_program,
+                                  feed={"pixel": img_data,
+                                        "label": y_data},
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
+        return test_accuracy.eval()
+
+    train_acc_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_train_acc' % (args.data_set, args.batch_size):
+            train_acc_kpi = kpi
+    train_speed_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_train_speed' % (args.data_set, args.batch_size):
+            train_speed_kpi = kpi
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            img_data = np.array(
+                map(lambda x: x[0].reshape(data_shape), data)).astype(
+                    "float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+            accuracy.add(value=acc, weight=weight)
+            iters += 1
+            num_samples += len(y_data)
+            if (batch_id % 10) == 0:
+                print(
+                    "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                    (pass_id, iters, loss, acc)
+                )  # The accuracy is the accumulation of batches, not that of the current batch.
+
+        # pass_train_acc = accuracy.eval()
+        train_losses.append(loss)
+        train_accs.append(acc)
+        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        #train_acc_kpi.add_record(np.array(train_accs, dtype='float32'))
+        train_speed_kpi.add_record(np.array(examples_per_sec, dtype='float32'))
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        break
+
+#train_acc_kpi.persist()
+    train_speed_kpi.persist()
+
+
+def print_arguments():
+    """
+    print_arguments
+    """
+    print('----------- vgg Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def collect_gpu_memory_data(alive):
+    """
+    collect the GPU memory data
+    """
+    global is_alive
+    status, output = commands.getstatusoutput('rm -rf memory.txt')
+    if status == 0:
+        print('del memory.txt')
+    command = "nvidia-smi --id=%s --query-compute-apps=used_memory --format=csv -lms 1 > memory.txt" % args.gpu_id
+    p = subprocess.Popen(command, shell=True)
+    if p.pid < 0:
+        print('Get GPU memory data error')
+    while (is_alive):
+        time.sleep(1)
+    p.kill()
+
+
+def save_gpu_data(mem_list):
+    gpu_memory_kpi = None
+    for kpi in tracking_kpis:
+        if kpi.name == '%s_%s_gpu_memory' % (args.data_set, args.batch_size):
+            gpu_memory_kpi = kpi
+    gpu_memory_kpi.add_record(max(mem_list))
+    gpu_memory_kpi.persist()
+
+
+if __name__ == "__main__":
+    print_arguments()
+    global is_alive
+    is_alive = True
+    collect_memory_thread = threading.Thread(
+        target=collect_gpu_memory_data, args=(is_alive, ))
+    collect_memory_thread.setDaemon(True)
+    collect_memory_thread.start()
+    main()
+    is_alive = False
diff --git a/vgg16/run.xsh b/vgg16/run.xsh
new file
mode 100755 index 00000000..7984ae3f --- /dev/null +++ b/vgg16/run.xsh @@ -0,0 +1,19 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +cudaid=${vgg16_cudaid:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + + +#cifar10 128 +FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=128 --data_set=cifar10 --iterations=300 --gpu_id=$cudaid +python get_gpu_data.py --batch_size=128 --data_set=cifar10 + +#flowers 32 +FLAGS_benchmark=true FLAGS_fraction_of_gpu_memory_to_use=0.0 python model.py --device=GPU --batch_size=32 --data_set=flowers --iterations=100 --gpu_id=$cudaid +python get_gpu_data.py --batch_size=32 --data_set=flowers +for pid in $(ps -ef | grep nvidia-smi | grep -v grep | cut -c 9-15); do + echo $pid + kill -9 $pid +done diff --git a/vgg16_aws_dist/__init__.py b/vgg16_aws_dist/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vgg16_aws_dist/ce_runner.py b/vgg16_aws_dist/ce_runner.py new file mode 100644 index 00000000..bbc19351 --- /dev/null +++ b/vgg16_aws_dist/ce_runner.py @@ -0,0 +1,350 @@ +import argparse +import logging +import sys, os +import numpy as np +import threading +import copy +import csv +from aws_runner.client.train_command import TrainCommand + +# for ce env ONLY + +sys.path.append(os.environ['ceroot']) +from continuous_evaluation import cluster_specs, kpis_map, generate_kpi_id, generate_cluster_id + +from aws_runner.client.abclient import Abclient + +def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + +parser = argparse.ArgumentParser(description=__doc__) + +parser.add_argument( + '--key_name', type=str, default="", help="required, key pair name") +parser.add_argument( + '--security_group_id', + type=str, + default="", + help="required, the security group id associated with your VPC") + +parser.add_argument( + '--vpc_id', + type=str, + default="", + help="The VPC in which you wish to run test") +parser.add_argument( + '--subnet_id', + type=str, + default="", + help="The Subnet_id in which you wish to run test") + +parser.add_argument( + '--pserver_instance_type', + type=str, + default="c5.2xlarge", + help="your pserver instance type, c5.2xlarge by default") +parser.add_argument( + '--trainer_instance_type', + type=str, + default="p2.8xlarge", + help="your trainer instance type, p2.8xlarge by default") + +parser.add_argument( + '--task_name', + type=str, + default="", + help="the name you want to identify your job") + +parser.add_argument( + '--pserver_image_id', + type=str, + default="ami-da2c1cbf", + help="ami id for system image, default one has nvidia-docker ready, \ + use ami-1ae93962 for us-east-2") + +parser.add_argument( + '--pserver_command', + type=str, + default="", + help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes" +) + +parser.add_argument( + '--trainer_image_id', + type=str, + default="ami-da2c1cbf", + help="ami id for system image, default one has nvidia-docker ready, \ + use ami-1ae93962 for us-west-2") + +parser.add_argument( + '--trainer_command', + type=str, + default="", + help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes" +) + 
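+# Note on the command strings above: they are presumably parsed by
+# TrainCommand as comma-separated argv tokens carrying colon-separated
+# key:value pairs, e.g. "python,vgg.py,batch_size:128,is_local:yes" as in the
+# help text; train_with_spec later updates pairs such as batch_size through
+# TrainCommand.update.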
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--action', type=str, default="create", help="create|cleanup|status")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_public_ip', type=str, help="master server public ip")
+
+parser.add_argument(
+    '--master_docker_image',
+    type=str,
+    default="putcn/paddle_aws_master:latest",
+    help="master docker image id")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+parser.add_argument(
+    '--online_mode',
+    type=str2bool,
+    default=False,
+    help="whether the client actively stays online")
+
+args = parser.parse_args()
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+class DataCollector(object):
+    _instance_store = {}
+    @classmethod
+    def get_instance_by_spec(cls, cluster_spec):
+        cluster_id = generate_cluster_id(cluster_spec)
+        if cluster_id not in cls._instance_store:
+            cls._instance_store[cluster_id] = cls(cluster_spec)
+        return cls._instance_store[cluster_id]
+    @classmethod
+    def persist_all(cls):
+        for _, collector in cls._instance_store.iteritems():
+            collector.persist()
+    @classmethod
+    def generate_csv(cls):
+        with open("report.csv", "w") as csvfile:
+            fieldnames = []
+            rows = []
+            for cluster_id, collector in cls._instance_store.iteritems():
+                row = {
+                    "cluster_spec": cluster_id
+                }
+                for metric_name, _ in collector.store.iteritems():
+                    if metric_name not in fieldnames:
+                        fieldnames.append(metric_name)
+                    row[metric_name] = collector.avg(metric_name)
+                rows.append(row)
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            for row in rows:
+                writer.writerow(row)
+    def __init__(self, cluster_spec):
+        self.store = {}
+        self.metric_data_identifier = "**metrics_data: "
+        self.cluster_spec = cluster_spec
+        self.cluster_id = generate_cluster_id(cluster_spec)
+    def log_processor(self, source, log_type):
+        for msg in iter(source.readline, ""):
+            logging.info(self.cluster_id)
+            logging.info(msg)
+            if (msg.startswith(self.metric_data_identifier)):
+                logging.info("metric data found, parse and save it")
+                str_msg = msg.replace(self.metric_data_identifier, "")
+                metrics_raw = str_msg.split(",")
+                for metric in metrics_raw:
+                    metric_data = metric.split("=")
+                    self.save(metric_data[0], metric_data[1])
+    def save(self, key, val):
+        key = key.strip()
+        if isinstance(val, str):
+            val = val.strip()
+        if (key not in self.store):
+            self.store[key] = []
+        logging.info("going to save " + key + "=" + str(val) + " from " +
+                     self.cluster_id)
+        self.store[key].append(float(val))
+    def get(self, key):
+        if (key in self.store):
+            return self.store[key]
+        return None
+    def avg(self, key):
+        vals = self.get(key)
+        if vals is None:
+            return None
+        return sum(vals)/float(len(vals))
+    def persist(self):
+        for metric_name, _ in self.store.iteritems():
+            kpi_id = generate_kpi_id(metric_name, self.cluster_spec)
+            logging.info("going to persist kpi " + kpi_id)
+            if kpi_id in kpis_map:
+                kpi_instance = kpis_map[kpi_id]
+                kpi_instance.add_record(np.array(self.avg(metric_name), dtype='float32'))
+                kpi_instance.persist()
+                logging.info("done persisting kpi " + kpi_id)
+            else:
+                logging.info("no such kpi id found in map!!!")
+                logging.info(kpi_id)
+
+def train_with_spec(spec, args, lock):
+    logging.info("updating cluster config and starting client")
+    test_name = spec[0]
+    batch_size = spec[1]
+    args.trainer_count = spec[2]
+    gpus_per_trainer_count = spec[3]
+    args.pserver_count = spec[4]
+    trainer_command = TrainCommand(args.trainer_command)
+
+    command_to_update = {
+        "model": test_name,
+        "batch_size": str(batch_size),
+        "gpus": str(gpus_per_trainer_count),
+    }
+
+    if args.pserver_count == 0 and args.trainer_count == 1:
+        command_to_update["update_method"] = "local"
+    ''' not yet supported because aws runner can't provide PADDLE_TRAINER_IPS
+    if args.pserver_count == 0 and args.trainer_count > 1:
+        command_to_update["update_method"] = "nccl2"
+    '''
+
+    trainer_command.update(command_to_update)
+    args.trainer_command = trainer_command.unparse()
+    args.pserver_command = args.trainer_command
+
+    data_collector = DataCollector.get_instance_by_spec(spec)
+
+    logging.info(args)
+    abclient = Abclient(args, data_collector.log_processor, lock)
+    abclient.create()
+
+'''
+ClusterIterator relies on the following spec structure:
+    test_name, batch_size, trainer_count, gpus_per_trainer_count, pserver_count
+    cluster_specs = [
+        ["vgg", 64, 1, 1, 0],
+        ["vgg", 64, 8, 1, 8],
+        ["vgg", 64, 16, 1, 8],
+        ["vgg", 64, 32, 1, 8],
+    ]
+It sequentially distributes the specs into chunks and makes sure each chunk
+does not exceed the trainer and pserver count thresholds.
+The specs above will be distributed into 2 chunks:
+[["vgg", 64, 1, 1, 0], ["vgg", 64, 8, 1, 8]]
+and
+[["vgg", 64, 16, 1, 8]]
+
+["vgg", 64, 32, 1, 8] itself does not fit in a single chunk, thus it gets
+discarded.
+'''
+class ClusterIterator:
+    def __init__(self, specs, trainer_count_threshold = 32, pserver_count_threshold = 10):
+        self.specs = specs
+        self.trainer_count_threshold = trainer_count_threshold
+        self.pserver_count_threshold = pserver_count_threshold
+        self.bad_specs = []
+    def __iter__(self):
+        return self
+    def spec_can_not_fit(self, trainer_count, pserver_count):
+        return (trainer_count > self.trainer_count_threshold or
+                pserver_count > self.pserver_count_threshold)
+    def next(self):
+        specs_to_ret = []
+        trainer_count = 0
+        pserver_count = 0
+        if len(self.specs) == 0:
+            raise StopIteration()
+        else:
+            while len(self.specs) != 0:
+                next_spec = self.specs[0]
+                # when a single spec can't even fit, move it to the bad spec list
+                if self.spec_can_not_fit(next_spec[2], next_spec[4]):
+                    self.bad_specs.append(self.specs.pop(0))
+                    continue
+                trainer_count += next_spec[2]
+                pserver_count += next_spec[4]
+                if self.spec_can_not_fit(trainer_count, pserver_count):
+                    break
+                specs_to_ret.append(self.specs.pop(0))
+        if len(specs_to_ret) == 0:
+            if len(self.bad_specs) != 0:
+                logging.info("%d specs were not able to fit in any test chunk"
+                             % len(self.bad_specs))
+            raise StopIteration()
+        return specs_to_ret
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        lock = threading.Lock()
+        cluster_specs_origin = copy.copy(cluster_specs)
+        for specs in ClusterIterator(cluster_specs):
+            logging.info("starting a new chunk of test")
+            testing_threads = []
+            for cluster_spec in specs:
+                logging.info("creating cluster thread with spec")
+                logging.info(cluster_spec)
+                thread = threading.Thread(
+                    target=train_with_spec,
+                    args=(cluster_spec, copy.copy(args), lock,)
+                )
+                testing_threads.append(thread)
+
+            for testing_thread in testing_threads:
+                testing_thread.start()
+
+            for testing_thread in testing_threads:
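+                # join() blocks until every cluster thread in this chunk has
+                # finished before the next chunk starts.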
+                testing_thread.join()
+            logging.info("testing chunk ended")
+
+        logging.info("all testing ended")
+
+        # generate speedup rate
+        # the 0th spec is the baseline
+        def get_speed_and_collector_by_spec(spec):
+            data_collector = DataCollector.get_instance_by_spec(spec)
+            return data_collector.avg("train_speed"), data_collector
+
+        logging.info("generating speedup")
+
+        # base_speed is supposed to be one trainer, one gpu, local mode
+        base_speed, _ = get_speed_and_collector_by_spec(cluster_specs_origin[0])
+        if base_speed is not None:
+            logging.info("base speed is %f" % base_speed)
+            for cluster_spec in cluster_specs_origin:
+                speed, data_collector = get_speed_and_collector_by_spec(cluster_spec)
+                if speed is not None:
+                    # speedup = speed * trainer_count / base_speed
+                    data_collector.save("speedup", speed*cluster_spec[2]/base_speed)
+        else:
+            logging.info("base speed is not available")
+
+        DataCollector.persist_all()
+        # DataCollector.generate_csv()
+
diff --git a/vgg16_aws_dist/continuous_evaluation.py b/vgg16_aws_dist/continuous_evaluation.py
new file mode 100644
index 00000000..dea1aa0e
--- /dev/null
+++ b/vgg16_aws_dist/continuous_evaluation.py
@@ -0,0 +1,44 @@
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import LessWorseKpi, GreaterWorseKpi
+
+kpis_specs = {
+    "speedup": [LessWorseKpi, 0.01],
+    "train_speed":[LessWorseKpi, 0.01],
+    # "converge_speed":[GreaterWorseKpi, 0.01],
+    # "gpu_memory":[GreaterWorseKpi, 0.01],
+    # "acc_4passes":[GreaterWorseKpi, 0.01],
+}
+
+# each row represents a cluster setting with the following columns:
+# test_name, batch_size, trainer_count, gpus_per_trainer_count, pserver_count
+# disable production cluster config for now
+# cluster_specs = [
+#     ["mnist", 64, 1, 1, 0],
+#     ["mnist", 64, 8, 1, 8],
+#     ["mnist", 64, 16, 1, 8],
+#     ["mnist", 64, 32, 1, 8],
+# ]
+
+cluster_specs = [
+    ["vgg", 16, 1, 1, 0],
+    ["vgg", 16, 4, 4, 4],
+    ["vgg", 16, 7, 8, 7],
+]
+
+kpis_map = {}
+
+tracking_kpis = []
+
+def generate_cluster_id(cluster_spec):
+    return "_".join(map(str, cluster_spec))
+def generate_kpi_id(kpi_name, cluster_spec):
+    return kpi_name + "_" + generate_cluster_id(cluster_spec)
+
+for kpi_type_name, (Kpi_class, diff_thre) in kpis_specs.items():
+    for cluster_spec in cluster_specs:
+        kpi_id = generate_kpi_id(kpi_type_name, cluster_spec)
+        the_kpi = Kpi_class(kpi_id, diff_thre)
+        tracking_kpis.append(the_kpi)
+        kpis_map[kpi_id] = the_kpi
\ No newline at end of file
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/Dockerfile b/vgg16_aws_dist/fluid_benchmark_for_aws/Dockerfile
new file mode 100644
index 00000000..bef80bb6
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/Dockerfile
@@ -0,0 +1,7 @@
+FROM paddlepaddlece/paddle:latest
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN apt install -y python-opencv
+ENTRYPOINT ["python", "fluid_benchmark.py"]
\ No newline at end of file
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/README.md b/vgg16_aws_dist/fluid_benchmark_for_aws/README.md
new file mode 100644
index 00000000..357ce932
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/README.md
@@ -0,0 +1,73 @@
+# Fluid Benchmark
+
+Originally from https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/fluid
+
+This directory contains several model configurations and tools used to run
+Fluid benchmarks for local and distributed training.
+
+
+## Run the Benchmark
+
+To start, run the following command to get the full help message:
+
+```bash
+python fluid_benchmark.py --help
+```
+
+Currently supported `--model` arguments include:
+
+* mnist
+* resnet
+  * you can choose to use a different dataset using `--data_set cifar10` or
+    `--data_set flowers`.
+* vgg
+* stacked_dynamic_lstm
+* machine_translation
+
+* Run the following command to start a benchmark job locally:
+  ```bash
+  python fluid_benchmark.py --model mnist --device GPU
+  ```
+  You can choose to use GPU/CPU training. With GPU training, you can specify
+  `--gpus <gpu_num>` to run multi-GPU training.
+* Run distributed training with parameter servers:
+  * start parameter servers:
+    ```bash
+    PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
+    ```
+  * start trainers:
+    ```bash
+    PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
+    ```
+* Run distributed training using NCCL2
+  ```bash
+  PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
+  ```
+
+## Run Distributed Benchmark on Kubernetes Cluster
+
+We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
+distributed benchmark jobs to your cluster. To generate a job yaml, just run:
+
+```bash
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+```
+
+Then the yaml files are generated under the directory `myjob`, and you can run:
+
+```bash
+kubectl create -f myjob/
+```
+
+The job should then start.
+
+
+## Notes for Running Fluid Distributed with NCCL2 and RDMA
+
+Before running NCCL2 distributed jobs, please check whether your node has multiple
+network interfaces; if so, try adding the environment variable
+`export NCCL_SOCKET_IFNAME=eth0` so that your actual network device is used.
+
+To run high-performance distributed training, you must prepare your hardware
+environment to be able to run RDMA-enabled network communication; please check
+out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
+note for details.
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/fluid_benchmark.py b/vgg16_aws_dist/fluid_benchmark_for_aws/fluid_benchmark.py
new file mode 100644
index 00000000..0f780a49
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/fluid_benchmark.py
@@ -0,0 +1,461 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import cProfile
+import time
+import os
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
+
+BENCHMARK_MODELS = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Fluid model benchmarks.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=BENCHMARK_MODELS,
+        default='resnet',
+        help='The model to run benchmark with.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.001,
+        help='The learning rate.')
+    # TODO(wuyi): add "--use_fake_data" option back.
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of leading minibatches to skip, for a more accurate performance test.'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data format; currently only NCHW is supported.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--no_test',
+        action='store_false',
+        help='If set, test the test set during training.')
+    parser.add_argument(
+        '--memory_optimize',
+        action='store_true',
+        help='If set, optimize runtime memory before start.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='If set, omit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default='local',
+        choices=['local', 'pserver', 'nccl2'],
+        help='Choose parameter update method, can be local, pserver, nccl2.')
+
+    parser.add_argument(
+        "--acc_target", default=0.6, type=float, help="training will be terminated when acc_target is reached")
+
+    args = parser.parse_args()
+    return args
+
+
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
+        # append gen_nccl_id at the end of startup program
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        port = os.getenv("PADDLE_PSERVER_PORT")
+        worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+        worker_endpoints = []
+        for ip in worker_ips.split(","):
+            worker_endpoints.append(':'.join([ip, port]))
+        num_trainers = len(worker_endpoints)
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
+        worker_endpoints.remove(current_endpoint)
+
+        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+            name="NCCLID",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+
fluid.default_startup_program().global_block().append_op( + type="gen_nccl_id", + inputs={}, + outputs={"NCCLID": nccl_id_var}, + attrs={ + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + "trainer_id": trainer_id + }) + return nccl_id_var, num_trainers, trainer_id + else: + raise Exception("must set positive PADDLE_TRAINER_ID env variables for " + "nccl-based dist train.") + + +def dist_transpile(trainer_id): + if trainer_id < 0: + return None, None + + # the port of all pservers, needed by both trainer and pserver + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + # comma separated ips of all pservers, needed by trainer and + # pserver + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + # total number of workers/trainers in the job, needed by + # trainer and pserver + trainers = int(os.getenv("PADDLE_TRAINERS")) + # the IP of the local machine, needed by pserver only + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + # the role, should be either PSERVER or TRAINER + training_role = os.getenv("PADDLE_TRAINING_ROLE") + + t = distribute_transpiler.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if training_role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, + pserver_program) + return pserver_program, pserver_startup_program + elif training_role == "TRAINER": + train_program = t.get_trainer_program() + return train_program, fluid.default_startup_program() + else: + raise ValueError( + 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' + ) + + +def test(exe, inference_program, test_reader, feeder, batch_acc): + accuracy_evaluator = fluid.metrics.Accuracy() + for batch_id, data in enumerate(test_reader()): + acc = exe.run(inference_program, + feed=feeder.feed(data), + fetch_list=[batch_acc]) + accuracy_evaluator.update(value=np.array(acc), weight=len(data)) + + return accuracy_evaluator.eval() + + +# TODO(wuyi): replace train, train_parallel, test functions with new trainer +# API once it is ready. 
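+
+# dist_transpile() above is driven entirely by environment variables; a
+# minimal two-pserver setup would look like this (values illustrative):
+#
+#   export PADDLE_PSERVER_PORT=6174
+#   export PADDLE_PSERVER_IPS=192.168.0.10,192.168.0.11  # comma separated
+#   export PADDLE_TRAINERS=2
+#   export PADDLE_CURRENT_IP=192.168.0.10                # this machine
+#   export PADDLE_TRAINING_ROLE=PSERVER                  # or TRAINER
+#
+# PADDLE_TRAINER_ID must also be >= 0, since dist_transpile() returns
+# (None, None) for a negative trainer id.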
+def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, batch_size_tensor, + args, train_prog, startup_prog): + if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER": + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(train_prog) + return + + if args.use_fake_data: + raise Exception( + "fake data is not supported in single GPU test for now.") + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_prog) + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + + acc_4passes = None + converge_speed = None + train_pass_acc = fluid.average.WeightedAverage() + fetch_list = [avg_loss] + if batch_acc is not None: + fetch_list.append(batch_acc) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + train_losses = [] + train_pass_acc.reset() + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + outs = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=fetch_list) + iters += 1 + num_samples += len(data) + loss = outs[0] + if batch_acc is not None: + acc = np.mean(outs[1]).item() + train_pass_acc.add(value=acc, weight=len(data)) + else: + acc = None + train_losses.append(loss) + print("Pass: %d, Iter: %d, Loss: %f, acc %s\n" % + (pass_id, iters, np.mean(train_losses), str(acc))) + if converge_speed is None and args.acc_target and acc >= args.acc_target: + converge_speed = time.time() - start_time + print("converge_speed set with %f" % converge_speed) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + if batch_acc is not None: + pass_train_acc = train_pass_acc.eval() + else: + pass_train_acc = None + + if pass_id == 4 and batch_acc is not None: + print("acc_4passes set with %f" % pass_train_acc) + acc_4passes = float(pass_train_acc) + + output_metric_data(pass_id, examples_per_sec, pass_train_acc, acc_4passes, converge_speed) + + # evaluation + if not args.no_test and batch_acc != None: + pass_test_acc = test(exe, infer_prog, test_reader, feeder, + batch_acc) + print(", Test Accuracy: %f" % pass_test_acc) + print("\n") + # TODO(wuyi): add warmup passes to get better perf data. + exit(0) + + +# TODO(wuyi): replace train, train_parallel, test functions with new trainer +# API once it is ready. 
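+
+# Both train() and train_parallel() below account for throughput the same
+# way: the timer and sample counter are reset once iters == skip_batch_num,
+# so examples_per_sec = num_samples / elapsed covers only steady-state
+# batches. With --use_fake_data, train_parallel() additionally replaces each
+# data variable with a fill_constant op so reader I/O is excluded entirely.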
+def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, + batch_acc, batch_size_tensor, args, train_prog, startup_prog, nccl_id_var, + num_trainers, trainer_id): + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + # generate fake: + if args.use_fake_data: + for var in feed_var_list: + v = startup_prog.global_block().clone_variable(var) + var.persistable = True + v.persistable = True + + real_shape = list(var.shape) + real_shape[0] = args.batch_size / args.gpus + startup_prog.global_block().append_op( + outputs={"Out": v}, + type="fill_constant", + attrs={"shape": real_shape, + "value": 1.0, + "dtype": var.dtype}) + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + if nccl_id_var and trainer_id == 0: + #FIXME(wuyi): wait other trainer to start listening + time.sleep(30) + + startup_exe = fluid.Executor(place) + startup_exe.run(startup_prog) + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, + avg_loss.name, + exec_strategy=strategy, + num_trainers=num_trainers, + trainer_id=trainer_id) + + feeder = fluid.DataFeeder(feed_var_list, place) + acc_4passes = None + converge_speed = None + accuracy_evaluator = fluid.metrics.Accuracy() + fetch_list = [avg_loss.name] + if batch_acc is not None: + fetch_list.append(batch_acc.name) + start_time = time.time() + + for pass_id in range(args.pass_num): + num_samples = 0 + iters = 0 + pass_start_time = time.time() + accuracy_evaluator.reset() + for batch_id, data in enumerate(train_reader()): + if args.profile and pass_id == 0 and batch_id == 5: + profiler.start_profiler("All") + elif args.profile and pass_id == 0 and batch_id == 10: + profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id) + + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + if args.use_fake_data: + outs = exe.run(fetch_list) + else: + outs = exe.run(fetch_list, feed=feeder.feed(data)) + + if args.update_method == "pserver": + exe.bcast_params() + num_samples += len(data) + iters += 1 + + if batch_acc is not None: + acc = np.mean(outs[1]).item() + accuracy_evaluator.update(value=acc, weight=len(data)) + else: + acc = None + + if batch_id % 1 == 0: + print("Pass %d, batch %d, loss %s, acc %s" % + (pass_id, batch_id, np.mean(outs[0]), str(acc))) + if converge_speed is None and args.acc_target and acc >= args.acc_target: + converge_speed = time.time() - start_time + print("converge_speed set with %f" % converge_speed) + + pass_elapsed = time.time() - pass_start_time + examples_per_sec = num_samples / pass_elapsed + if batch_acc is not None: + pass_train_acc = accuracy_evaluator.eval() + else: + pass_train_acc = None + + if pass_id == 4 and batch_acc is not None: + print("acc_4passes set with %f" % pass_train_acc) + acc_4passes = float(pass_train_acc) + + output_metric_data(pass_id, examples_per_sec, pass_train_acc, acc_4passes, converge_speed) + + if not args.no_test and batch_acc != None: + test_acc = test(startup_exe, infer_prog, test_reader, feeder, + batch_acc) + print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) + exit(0) + +def output_metric_data(pass_id, examples_per_sec, pass_train_acc, acc_4passes, converge_speed): + msgs = [] + msgs.append("pass = %d" % pass_id) + msgs.append("train_speed = %f" % float(examples_per_sec)) + if isinstance(pass_train_acc, float): + msgs.append("train_accuracy = %f" % 
pass_train_acc)
+    if isinstance(acc_4passes, float):
+        msgs.append("acc_4passes = %f" % acc_4passes)
+    if isinstance(converge_speed, float):
+        msgs.append("converge_speed = %f" % converge_speed)
+    print("**metrics_data: " + ", ".join(msgs))
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- %s Configuration Arguments -----------' % args.model)
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def main():
+    args = parse_args()
+    print_arguments(args)
+
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
+
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    model_def = __import__("models.%s" % args.model, fromlist=["models"])
+    train_args = list(model_def.get_model(args))
+    train_args.append(args)
+    # Run optimizer.minimize(avg_loss)
+    train_args[2].minimize(train_args[0])
+    if args.memory_optimize:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    if args.update_method == "pserver":
+        train_prog, startup_prog = dist_transpile(trainer_id)
+        if not train_prog:
+            raise Exception(
+                "Must configure correct environments to run dist train.")
+        train_args.extend([train_prog, startup_prog])
+        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
+            train_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*train_args)
+        train(*train_args)
+        exit(0)
+
+    # for other update methods, use default programs
+    train_args.append(fluid.default_main_program())
+    train_args.append(fluid.default_startup_program())
+
+    if args.update_method == "nccl2":
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
+    if args.gpus == 1:
+        # NOTE: parallel executor uses profiler internally
+        if args.use_nvprof and args.device == 'GPU':
+            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+                train(*train_args)
+        else:
+            train(*train_args)
+    else:
+        if args.device == "CPU":
+            raise Exception("Only support GPU perf with parallel exe")
+        train_args.extend([nccl_id_var, num_trainers, trainer_id])
+        train_parallel(*train_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/kube_gen_job.py b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_gen_job.py
new file mode 100644
index 00000000..39ba207f
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_gen_job.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
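+
+# Generates Kubernetes yaml for a distributed benchmark job; typical usage
+# (mirroring the README):
+#
+#   python kube_gen_job.py --jobname myjob --pservers 4 --trainers 4 \
+#       --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 \
+#       --entry "python fluid_benchmark.py ..." --disttype pserver
+#
+# This writes myjob/pserver.yaml (only for --disttype pserver) and
+# myjob/trainer.yaml, ready for `kubectl create -f myjob/`.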
+
+import yaml
+import copy
+import argparse
+import random
+import os
+from kube_templates import pserver, trainer, envs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Generate dist job yamls.')
+
+    parser.add_argument(
+        '--jobname', default="paddlejob", help='unique job name')
+    parser.add_argument(
+        '--cpu', default=1, type=int, help='CPU cores per trainer node')
+    parser.add_argument(
+        '--pscpu', default=1, type=int, help='CPU cores per pserver node')
+    parser.add_argument(
+        '--gpu', default=0, type=int, help='num of GPUs per node')
+    parser.add_argument(
+        '--image',
+        default="bootstrapper:5000/fluid_benchmark:gpu",
+        help='docker image to run the job with')
+    parser.add_argument(
+        '--pservers', default=1, type=int, help='num of pservers')
+    parser.add_argument(
+        '--trainers', default=1, type=int, help='num of trainers')
+    parser.add_argument('--memory', default=1, type=int, help='trainer memory')
+    parser.add_argument(
+        '--psmemory', default=1, type=int, help='pserver memory')
+    parser.add_argument(
+        '--port', default=30236, type=int, help='pserver port')
+    parser.add_argument(
+        '--entry', default="python train.py", help='command to run')
+    parser.add_argument(
+        '--fluid', default=1, type=int, help='whether is fluid job')
+    parser.add_argument(
+        '--rdma', action='store_true', help='whether mount rdma libs')
+    parser.add_argument(
+        '--disttype',
+        default="pserver",
+        type=str,
+        choices=['pserver', 'nccl2', 'local'],
+        help='pserver or nccl2 or local')
+
+    args = parser.parse_args()
+    return args
+
+
+def gen_job():
+    ps = pserver
+    tn = trainer
+    args = parse_args()
+
+    ps_container = ps["spec"]["template"]["spec"]["containers"][0]
+    tn_container = tn["spec"]["template"]["spec"]["containers"][0]
+
+    if args.fluid == 1:
+        ps_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+        tn_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+    ps["metadata"]["name"] = args.jobname + "-pserver"
+    ps["spec"]["template"]["metadata"]["labels"][
+        "paddle-job-pserver"] = args.jobname
+    tn["metadata"]["name"] = args.jobname + "-trainer"
+    tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
+
+    ps_container["image"] = args.image
+    tn_container["image"] = args.image
+
+    ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
+    ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
+
+    tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
+    tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
+    if args.gpu > 0:
+        tn_container["resources"]["requests"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+        tn_container["resources"]["limits"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+
+    ps["spec"]["replicas"] = int(args.pservers)
+    tn["spec"]["parallelism"] = int(args.trainers)
+    tn["spec"]["completions"] = int(args.trainers)
+    ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
+    ps_container["ports"][0]["containerPort"] = args.port
+    spreadport = random.randint(40000, 60000)
+    tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
+    tn_container["ports"][0]["containerPort"] = spreadport
+
+    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
+    envs.append({"name": "TRAINERS",
"value": str(args.trainers)}) + envs.append({"name": "PSERVERS", "value": str(args.pservers)}) + envs.append({"name": "ENTRY", "value": args.entry}) + envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) + envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) + # NOTE: these directories below are cluster specific, please modify + # this settings before you run on your own cluster. + envs.append({ + "name": "LD_LIBRARY_PATH", + "value": + "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind" + }) + + volumes = [{ + "name": "nvidia-driver", + "hostPath": { + "path": "/usr/local/nvidia/lib64" + } + }] + volumeMounts = [{ + "mountPath": "/usr/local/nvidia/lib64", + "name": "nvidia-driver" + }] + + if args.rdma: + volumes.extend([{ + "name": "ibetc", + "hostPath": { + "path": "/etc/libibverbs.d" + } + }, { + "name": "iblibs", + "hostPath": { + "path": "/usr/local/rdma" + } + }, { + "name": "valgrind", + "hostPath": { + "path": "/usr/lib64/mlnx_ofed/valgrind" + } + }]) + volumeMounts.extend([{ + "mountPath": "/etc/libibverbs.d", + "name": "ibetc" + }, { + "mountPath": "/usr/local/rdma", + "name": "iblibs" + }, { + "mountPath": "/usr/lib64/mlnx_ofed/valgrind", + "name": "valgrind" + }]) + # append shm for NCCL2 + volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}}) + volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"}) + + tn["spec"]["template"]["spec"]["volumes"] = volumes + tn_container["volumeMounts"] = volumeMounts + + ps_container["env"] = envs + ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) + tn_container["env"] = envs + if args.disttype == "pserver": + tn_container["env"].append({ + "name": "TRAINING_ROLE", + "value": "TRAINER" + }) + elif args.disttype == "nccl2" or args.disttype == "local": + # NCCL2 have no training role, set to plain WORKER + tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) + + os.mkdir(args.jobname) + if args.disttype == "pserver": + with open("%s/pserver.yaml" % args.jobname, "w") as fn: + yaml.dump(ps, fn) + + with open("%s/trainer.yaml" % args.jobname, "w") as fn: + yaml.dump(tn, fn) + + +if __name__ == "__main__": + gen_job() diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/__init__.py b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/__init__.py new file mode 100644 index 00000000..2d09d940 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/__init__.py @@ -0,0 +1,66 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pserver import pserver +from trainer import trainer + +__all__ = ["pserver", "trainer", "envs"] + +envs = [ + # envs that don't need to change + { + "name": "GLOG_v", + "value": "0" + }, + { + "name": "GLOG_logtostderr", + "value": "1" + }, + { + "name": "TOPOLOGY", + "value": "" + }, + { + "name": "TRAINER_PACKAGE", + "value": "/workspace" + }, + { + "name": "PADDLE_INIT_NICS", + "value": "eth2" + }, + { + "name": "NAMESPACE", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.namespace" + } + } + }, + { + "name": "POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + } + }, + { + "name": "PADDLE_CURRENT_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + } + } +] diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/pserver.py b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/pserver.py new file mode 100644 index 00000000..b54982c8 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/pserver.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pserver = { + "apiVersion": "extensions/v1beta1", + "kind": "ReplicaSet", + "metadata": { + "name": "jobname-pserver" + }, + "spec": { + "replicas": 1, + "template": { + "metadata": { + "labels": { + "paddle-job-pserver": "jobname" + } + }, + "spec": { + "hostNetwork": True, + "imagePullSecrets": [{ + "name": "job-registry-secret" + }], + "containers": [{ + "name": "pserver", + "image": "", + "imagePullPolicy": "Always", + "ports": [{ + "name": "jobport-1", + "containerPort": 1 + }], + "env": [], + "command": ["paddle_k8s", "start_pserver"], + "resources": { + "requests": { + "memory": "10Gi", + "cpu": "4" + }, + "limits": { + "memory": "10Gi", + "cpu": "4" + } + } + }] + } + } + } +} diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/trainer.py b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/trainer.py new file mode 100644 index 00000000..b915d31e --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/kube_templates/trainer.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
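+
+# Template for the trainer Job. kube_gen_job.py overwrites the placeholder
+# fields below before dumping the final yaml: metadata.name (which still
+# reads "jobname-pserver" here), image, ports, resources and env, and it
+# sets parallelism == completions == the requested trainer count.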
+ +trainer = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": "jobname-pserver" + }, + "spec": { + "parallelism": 4, + "completions": 4, + "template": { + "metadata": { + "labels": { + "paddle-job": "jobname" + } + }, + "spec": { + "hostNetwork": True, + "imagePullSecrets": [{ + "name": "job-registry-secret" + }], + "restartPolicy": "Never", + "containers": [{ + "name": "trainer", + "image": "", + "imagePullPolicy": "Always", + # to let container set rlimit + "securityContext": { + "privileged": True + # TODO(wuyi): use below specific cap instead of privileged, + # using privileged will cause all GPU device are visible + # in the container. + # "capabilities": { + # "add": ["SYS_RESOURCE"] + # } + }, + "ports": [{ + "name": "jobport-1", + "containerPort": 1 + }], + "env": [], + "command": ["paddle_k8s", "start_trainer", "v2"], + "resources": { + "requests": { + "memory": "10Gi", + "cpu": "4", + }, + "limits": { + "memory": "10Gi", + "cpu": "4", + } + } + }] + } + } + } +} diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/__init__.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/__init__.py new file mode 100644 index 00000000..1c3fcac8 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/machine_translation.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/machine_translation.py new file mode 100644 index 00000000..122a66c9 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/machine_translation.py @@ -0,0 +1,232 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
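+
+# lstm_step() below hand-builds one LSTM cell out of fc/sigmoid/tanh ops,
+# i.e., with [h, x] denoting the concatenated previous hidden state and input:
+#   f_t  = sigmoid(W_f [h, x])      forget gate
+#   i_t  = sigmoid(W_i [h, x])      input gate
+#   o_t  = sigmoid(W_o [h, x])      output gate
+#   c~_t = tanh(W_c [h, x])         candidate cell state
+#   c_t  = f_t * c_{t-1} + i_t * c~_t
+#   h_t  = o_t * tanh(c_t)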
+"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. + input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, + decoder_boot, decoder_size): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + act='tanh', + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = 
fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + encoder_vec = rnn.static_input(encoder_vec) + encoder_proj = rnn.static_input(encoder_proj) + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) + cell_mem = rnn.memory(init=cell_init) + context = simple_attention(encoder_vec, encoder_proj, hidden_mem) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + rnn.output(out) + return rnn() + + if not is_generating: + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, + encoded_proj, decoder_boot, + decoder_size) + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + return avg_cost, feeding_list + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray + + +def get_model(args): + embedding_dim = 512 + encoder_size = 512 + decoder_size = 512 + dict_size = 30000 + beam_size = 3 + max_length = 250 + avg_cost, feeding_list = seq_to_seq_net( + embedding_dim, + encoder_size, + decoder_size, + dict_size, + dict_size, + False, + beam_size=beam_size, + max_length=max_length) + + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(dict_size), buf_size=1000), + batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, train_batch_generator, \ + test_batch_generator, None, None diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/mnist.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/mnist.py new file mode 100644 index 00000000..9606304b --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/mnist.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import cProfile + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +SEED = 1 +DTYPE = "float32" + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(args): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc, batch_size_tensor diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/resnet.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/resnet.py new file mode 100644 index 00000000..34748e37 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/resnet.py @@ -0,0 +1,161 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import numpy as np +import time + +import cProfile, pstats, StringIO + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride): + short = shortcut(input, ch_out * 4, stride) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) + conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) + for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return out + + +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): + assert (depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + return out + + +def get_model(args): + model = resnet_cifar10 + if args.data_set == "cifar10": + class_dim = 10 + if args.data_format == 'NCHW': + dshape = [3, 32, 32] + else: + dshape = [32, 32, 3] + model = resnet_cifar10 + else: + class_dim = 102 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + model = resnet_imagenet + + input = 
fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc, batch_size_tensor diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/stacked_dynamic_lstm.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/stacked_dynamic_lstm.py new file mode 100644 index 00000000..bd44a607 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/stacked_dynamic_lstm.py @@ -0,0 +1,139 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
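+
+# to_lodtensor() at the bottom of this file packs a batch of variable-length
+# sequences into a single LoDTensor: e.g. three sequences of lengths
+# [3, 2, 4] are flattened into a 9x1 int64 tensor with lod [[0, 3, 5, 9]].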
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import cPickle +import os +import random +import time + +import numpy +import paddle +import paddle.dataset.imdb as imdb +import paddle.fluid as fluid +import paddle.batch as batch +import paddle.fluid.profiler as profiler + +word_dict = imdb.word_dict() + + +def crop_sentence(reader, crop_size): + unk_value = word_dict[''] + + def __impl__(): + for item in reader(): + if len([x for x in item[0] if x != unk_value]) < crop_size: + yield item + + return __impl__ + + +def get_model(args): + lstm_size = 512 + emb_dim = 512 + crop_size = 1500 + + data = fluid.layers.data( + name="words", shape=[1], lod_level=1, dtype='int64') + sentence = fluid.layers.embedding( + input=data, size=[len(word_dict), emb_dim]) + + sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + word = rnn.step_input(sentence) + prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) + prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) + + def gate_common( + ipt, + hidden, + size, ): + gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) + gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) + gate = fluid.layers.sums(input=[gate0, gate1]) + return gate + + forget_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + input_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + output_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + cell_gate = fluid.layers.tanh( + x=gate_common(word, prev_hidden, lstm_size)) + + cell = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul( + x=input_gate, y=cell_gate) + ]) + + hidden = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell)) + + rnn.update_memory(prev_cell, cell) + rnn.update_memory(prev_hidden, hidden) + rnn.output(hidden) + + last = fluid.layers.sequence_pool(rnn(), 'last') + logit = fluid.layers.fc(input=last, size=2, act='softmax') + loss = fluid.layers.cross_entropy( + input=logit, + label=fluid.layers.data( + name='label', shape=[1], dtype='int64')) + loss = fluid.layers.mean(x=loss) + + # add acc + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ + shape=[1], dtype='int64'), total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + adam = fluid.optimizer.Adam() + + train_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), + batch_size=args.batch_size) + test_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), + batch_size=args.batch_size) + + return loss, inference_program, adam, train_reader, test_reader, batch_acc, batch_size_tensor + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = numpy.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + 
res.set_lod([lod]) + return res diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/models/vgg.py b/vgg16_aws_dist/fluid_benchmark_for_aws/models/vgg.py new file mode 100644 index 00000000..6571bbf6 --- /dev/null +++ b/vgg16_aws_dist/fluid_benchmark_for_aws/models/vgg.py @@ -0,0 +1,104 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import argparse +import functools + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def get_model(args): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + 
batch_size=args.batch_size)
+
+    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc, batch_size_tensor
diff --git a/vgg16_aws_dist/fluid_benchmark_for_aws/run.sh b/vgg16_aws_dist/fluid_benchmark_for_aws/run.sh
new file mode 100644
index 00000000..f6dfd20b
--- /dev/null
+++ b/vgg16_aws_dist/fluid_benchmark_for_aws/run.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# This script benchmarks PaddlePaddle Fluid on a
+# single thread and a single GPU.
+
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+if [ $ht -eq 1 ]; then # HT is OFF
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+        export OMP_DYNAMIC="FALSE"
+    fi
+else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+fi
+# disable multi-gpu if there is more than one
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+# only query the gpu used
+nohup stdbuf -oL nvidia-smi \
+    --id=${CUDA_VISIBLE_DEVICES} \
+    --query-gpu=timestamp \
+    --query-compute-apps=pid,process_name,used_memory \
+    --format=csv \
+    --filename=mem.log \
+    -l 1 &
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+    --device=GPU \
+    --batch_size=128 \
+    --skip_batch_num=5 \
+    --iterations=500 \
+    2>&1 | tee -a mnist_gpu_128.log
+
+# vgg16
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+    --device=GPU \
+    --batch_size=128 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a vgg16_gpu_128.log
+
+# flowers gpu 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+    --device=GPU \
+    --batch_size=32 \
+    --data_set=flowers \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a vgg16_gpu_flowers_32.log
+
+# resnet50
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+    --device=GPU \
+    --batch_size=128 \
+    --data_set=cifar10 \
+    --model=resnet_cifar10 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+    --device=GPU \
+    --batch_size=64 \
+    --data_set=flowers \
+    --model=resnet_imagenet \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a resnet50_gpu_flowers_64.log
+
+# lstm
+# lstm gpu imdb 32 # tensorflow only support batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+    --device=GPU \
+    --batch_size=32 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    --hidden_dim=512 \
+    --emb_dim=512 \
+    --crop_size=1500 \
+    2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmb 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+    --device=GPU \
+    --batch_size=128 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a lstm_gpu_128.log
diff --git a/vgg16_aws_dist/latest_kpis/speedup_rate_factor.txt b/vgg16_aws_dist/latest_kpis/speedup_rate_factor.txt
new file mode 100644
index 00000000..edf5775a
--- /dev/null
+++ b/vgg16_aws_dist/latest_kpis/speedup_rate_factor.txt
@@ -0,0 +1 @@
+[0.5]
\ No newline at end of file
diff --git a/vgg16_aws_dist/run.xsh b/vgg16_aws_dist/run.xsh
new file mode 100755
index 00000000..be239834
---
/dev/null
+++ b/vgg16_aws_dist/run.xsh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+set -xe
+
+CURRENT_FILE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+PADDLE_PATH=$CURRENT_FILE_DIR/../../..
+paddle_build_path=$PADDLE_PATH/build
+paddle_docker_hub_tag="paddlepaddlece/paddle:latest"
+fluid_benchmark_dockerhub_tag="paddlepaddlece/fluid_benchmark:latest"
+training_command="update_method:pserver,acc_target:0.6,iterations:100,pass_num:1"
+
+# clean up docker
+docker system prune -f
+
+# log into docker hub
+# login is now performed in teamcity
+# docker login -u $DOCKER_HUB_USERNAME -p $DOCKER_HUB_PASSWORD
+
+# create paddle docker image
+echo "going to build and push paddle production image"
+docker build -t $paddle_docker_hub_tag $paddle_build_path
+docker push $paddle_docker_hub_tag
+
+# build test docker image
+cd $CURRENT_FILE_DIR
+
+cd fluid_benchmark_for_aws
+if [ -d ~/.cache/paddle/dataset/cifar ]; then
+    echo "host cifar dataset cache found, copying it to docker root"
+    mkdir -p .cache/paddle/dataset/
+    cp -r -f ~/.cache/paddle/dataset/cifar .cache/paddle/dataset/
+fi
+
+if [ -d ~/.cache/paddle/dataset/flowers ]; then
+    echo "host flowers dataset cache found, copying it to docker root"
+    mkdir -p .cache/paddle/dataset/
+    cp -r -f ~/.cache/paddle/dataset/flowers .cache/paddle/dataset/
+fi
+
+cd ..
+
+echo "going to build fluid_benchmark_for_aws docker image and push it"
+docker build -t $fluid_benchmark_dockerhub_tag ./fluid_benchmark_for_aws
+docker push $fluid_benchmark_dockerhub_tag
+
+# fetch runner and install dependencies
+echo "going to work with aws_runner"
+if [ ! -d aws_runner ]; then
+    echo "no aws_runner found, cloning one"
+    git clone https://github.com/putcn/aws_runner.git
+fi
+cd aws_runner
+git pull
+cd ..
+echo "going to install aws_runner dependencies"
+pip install -r aws_runner/client/requirements.txt
+
+echo "going to start testing"
+# start aws testing
+python ce_runner.py \
+    --key_name aws_benchmark_us_east \
+    --security_group_id sg-95539dff \
+    --online_mode yes \
+    --pserver_command $training_command \
+    --trainer_command $training_command \
+    --docker_image $fluid_benchmark_dockerhub_tag
\ No newline at end of file
diff --git a/vgg16_aws_dist/speedup_vgg_16_1_1_0_factor.txt b/vgg16_aws_dist/speedup_vgg_16_1_1_0_factor.txt
new file mode 100644
index 00000000..e7a19a6e
--- /dev/null
+++ b/vgg16_aws_dist/speedup_vgg_16_1_1_0_factor.txt
@@ -0,0 +1 @@
+[1.0]
\ No newline at end of file
diff --git a/vgg16_aws_dist/speedup_vgg_16_4_4_4_factor.txt b/vgg16_aws_dist/speedup_vgg_16_4_4_4_factor.txt
new file mode 100644
index 00000000..3ea09272
--- /dev/null
+++ b/vgg16_aws_dist/speedup_vgg_16_4_4_4_factor.txt
@@ -0,0 +1 @@
+[10.233551979064941]
\ No newline at end of file
diff --git a/vgg16_aws_dist/speedup_vgg_16_7_8_7_factor.txt b/vgg16_aws_dist/speedup_vgg_16_7_8_7_factor.txt
new file mode 100644
index 00000000..c3f822e5
--- /dev/null
+++ b/vgg16_aws_dist/speedup_vgg_16_7_8_7_factor.txt
@@ -0,0 +1 @@
+[11.316923141479492]
\ No newline at end of file
diff --git a/vgg16_aws_dist/train_speed_vgg_16_1_1_0_factor.txt b/vgg16_aws_dist/train_speed_vgg_16_1_1_0_factor.txt
new file mode 100644
index 00000000..55d41345
--- /dev/null
+++ b/vgg16_aws_dist/train_speed_vgg_16_1_1_0_factor.txt
@@ -0,0 +1 @@
+[11.437457084655762]
\ No newline at end of file
diff --git a/vgg16_aws_dist/train_speed_vgg_16_4_4_4_factor.txt b/vgg16_aws_dist/train_speed_vgg_16_4_4_4_factor.txt
new file mode 100644
index 00000000..c133cf2f
--- /dev/null
+++
b/vgg16_aws_dist/train_speed_vgg_16_4_4_4_factor.txt @@ -0,0 +1 @@ +[29.26145362854004] \ No newline at end of file diff --git a/vgg16_aws_dist/train_speed_vgg_16_7_8_7_factor.txt b/vgg16_aws_dist/train_speed_vgg_16_7_8_7_factor.txt new file mode 100644 index 00000000..b0991782 --- /dev/null +++ b/vgg16_aws_dist/train_speed_vgg_16_7_8_7_factor.txt @@ -0,0 +1 @@ +[18.49097442626953] \ No newline at end of file