From 0e2ba0638eb590096b9ac98320b70b2056f29d0e Mon Sep 17 00:00:00 2001 From: Paddle CI Date: Tue, 29 May 2018 13:03:53 +0800 Subject: [PATCH] add multi card for text_classification --- text_classification/continuous_evaluation.py | 8 +++- .../lstm_pass_duration_card4_factor.txt | 1 + .../lstm_train_cost_card4_factor.txt | 1 + text_classification/run.xsh | 9 +++- text_classification/train.py | 46 +++++++++++++------ text_classification/utils.py | 18 +++----- 6 files changed, 55 insertions(+), 28 deletions(-) create mode 100644 text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt create mode 100644 text_classification/latest_kpis/lstm_train_cost_card4_factor.txt diff --git a/text_classification/continuous_evaluation.py b/text_classification/continuous_evaluation.py index 133a0d35..9d9c9240 100644 --- a/text_classification/continuous_evaluation.py +++ b/text_classification/continuous_evaluation.py @@ -10,4 +10,10 @@ lstm_train_cost_kpi = CostKpi('lstm_train_cost', 5, 0) lstm_pass_duration_kpi = DurationKpi('lstm_pass_duration', 0.02, 0, actived=True) -tracking_kpis = [lstm_train_cost_kpi, lstm_pass_duration_kpi] +lstm_train_cost_kpi_card4 = CostKpi('lstm_train_cost_card4', 0.2, 0) +lstm_pass_duration_kpi_card4 = DurationKpi('lstm_pass_duration_card4', 0.02, 0, actived=True) + +tracking_kpis = [ + lstm_train_cost_kpi, lstm_pass_duration_kpi, + lstm_train_cost_kpi_card4, lstm_pass_duration_kpi_card4, + ] diff --git a/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt b/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt new file mode 100644 index 00000000..bfd66206 --- /dev/null +++ b/text_classification/latest_kpis/lstm_pass_duration_card4_factor.txt @@ -0,0 +1 @@ +[17.750867716471355] \ No newline at end of file diff --git a/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt b/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt new file mode 100644 index 00000000..f8d4e66e --- /dev/null +++ 
b/text_classification/latest_kpis/lstm_train_cost_card4_factor.txt @@ -0,0 +1 @@ +[0.0030332264248281717] diff --git a/text_classification/run.xsh b/text_classification/run.xsh index 9f93ed3d..29c8faab 100755 --- a/text_classification/run.xsh +++ b/text_classification/run.xsh @@ -2,8 +2,13 @@ export MKL_NUM_THREADS=1 export OMP_NUM_THREADS=1 -cudaid=${text_classification:=0} # use 0-th card as default + +cudaid=${text_classification:=0} +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --model lstm + +cudaid=${text_classification_m:=0,1,2,3} # use cards 0-3 as default export CUDA_VISIBLE_DEVICES=$cudaid #LSTM pass_num 15 -FLAGS_benchmark=true python train.py lstm +FLAGS_benchmark=true python train.py --model lstm --gpu_card_num 4 diff --git a/text_classification/train.py b/text_classification/train.py index b22001ea..dfb3f877 100644 --- a/text_classification/train.py +++ b/text_classification/train.py @@ -5,14 +5,24 @@ import paddle.fluid as fluid import paddle - +import argparse import utils from nets import bow_net from nets import cnn_net from nets import lstm_net from nets import gru_net -from continuous_evaluation import lstm_train_cost_kpi, lstm_pass_duration_kpi +from continuous_evaluation import * +fluid.default_startup_program().random_seed = 99 + +def parse_args(): + parser = argparse.ArgumentParser("text_classification model benchmark.") + parser.add_argument( + '--model', type=str, default="lstm", help='model to run.') + parser.add_argument( + '--gpu_card_num', type=int, default=1, help='gpu card num used.') + args = parser.parse_args() + return args def train(train_reader, word_dict, @@ -26,6 +36,7 @@ def train(train_reader, """ train network """ + args = parse_args() data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) @@ -34,7 +45,7 @@ def train(train_reader, if not parallel: cost, acc, prediction = network(data, label, len(word_dict)) else: - places = fluid.layers.get_places(device_count=2) + places
= fluid.layers.get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): cost, acc, prediction = network( @@ -76,20 +87,29 @@ def train(train_reader, print("pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost)) if pass_id == pass_num - 1: - lstm_train_cost_kpi.add_record(newest_avg_cost) - lstm_pass_duration_kpi.add_record(total_time / pass_num) + if args.gpu_card_num == 1: + lstm_train_cost_kpi.add_record(newest_avg_cost) + lstm_pass_duration_kpi.add_record(total_time / pass_num) + else: + lstm_train_cost_kpi_card4.add_record(newest_avg_cost) + lstm_pass_duration_kpi_card4.add_record(total_time / pass_num) + epoch_model = save_dirname + "/" + "epoch" + str(pass_id) fluid.io.save_inference_model(epoch_model, ["words", "label"], acc, exe) - lstm_train_cost_kpi.persist() - lstm_pass_duration_kpi.persist() - + if args.gpu_card_num == 1: + lstm_train_cost_kpi.persist() + lstm_pass_duration_kpi.persist() + else: + lstm_train_cost_kpi_card4.persist() + lstm_pass_duration_kpi_card4.persist() def train_net(): + args = parse_args() word_dict, train_reader, test_reader = utils.prepare_data( "imdb", self_dict=False, batch_size=128, buf_size=50000) - if sys.argv[1] == "bow": + if args.model == "bow": train( train_reader, word_dict, @@ -100,7 +120,7 @@ def train_net(): lr=0.002, pass_num=30, batch_size=128) - elif sys.argv[1] == "cnn": + elif args.model == "cnn": train( train_reader, word_dict, @@ -111,18 +131,18 @@ def train_net(): lr=0.01, pass_num=30, batch_size=4) - elif sys.argv[1] == "lstm": + elif args.model == "lstm": train( train_reader, word_dict, lstm_net, use_cuda=True, - parallel=False, + parallel=True, save_dirname="lstm_model", lr=0.05, pass_num=15, batch_size=4) - elif sys.argv[1] == "gru": + elif args.model == "gru": train( train_reader, word_dict, diff --git a/text_classification/utils.py b/text_classification/utils.py index bda95656..bff77d11 100644 --- a/text_classification/utils.py +++ b/text_classification/utils.py @@ -69,35 +69,29 
@@ def prepare_data(data_type="imdb", if data_type == "imdb": train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=buf_size), + paddle.dataset.imdb.train(word_dict), batch_size=batch_size) test_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.test(word_dict), buf_size=buf_size), + paddle.dataset.imdb.test(word_dict), batch_size=batch_size) elif data_type == "light_imdb": train_reader = paddle.batch( - paddle.reader.shuffle( - light_imdb.train(word_dict), buf_size=buf_size), + light_imdb.train(word_dict), batch_size=batch_size) test_reader = paddle.batch( - paddle.reader.shuffle( - light_imdb.test(word_dict), buf_size=buf_size), + light_imdb.test(word_dict), batch_size=batch_size) elif data_type == "tiny_imdb": train_reader = paddle.batch( - paddle.reader.shuffle( - tiny_imdb.train(word_dict), buf_size=buf_size), + tiny_imdb.train(word_dict), batch_size=batch_size) test_reader = paddle.batch( - paddle.reader.shuffle( - tiny_imdb.test(word_dict), buf_size=buf_size), + tiny_imdb.test(word_dict), batch_size=batch_size) else: raise RuntimeError("no such dataset")