Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Resnet50 for bf16 mixed precision #5316

Open
wants to merge 1 commit into
base: dev-static
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions PaddleCV/image_classification/scripts/train/Resnet50_bf16.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash -ex
#
# Launch ResNet50 image-classification training (train.py) with bf16
# automatic mixed precision enabled.
#
# Usage: edit DATA_DIR below so it points at the dataset root, then run this
# script from the directory that contains train.py.
#
# Shell options: -e aborts on the first failing command, -x echoes every
# command before it runs.

# Environment flags exported for the training process.
export FLAGS_conv_workspace_size_limit=4000 #MB
export FLAGS_cudnn_exhaustive_search=1
export FLAGS_cudnn_batchnorm_spatial_persistent=1

# Placeholder: replace with the real dataset path before running.
DATA_DIR="Your image dataset path, e.g. /work/datasets/ILSVRC2012/"
DATA_FORMAT="NHWC"

# Precision switches forwarded to train.py (--use_amp_bf16 / --use_pure_bf16).
USE_AMP_BF16=true
USE_PURE_BF16=false

# Feature toggles; each must be the literal string "true" or "false".
USE_DALI=false
USE_ADDTO=true

# Compare against the literal "true" instead of executing the variable as a
# command: an empty or mistyped value then disables the branch rather than
# running an arbitrary (or empty, hence always-successful) command.
if [ "${USE_ADDTO}" = "true" ]; then
    export FLAGS_max_inplace_grad_add=8
fi

if [ "${USE_DALI}" = "true" ]; then
    export FLAGS_fraction_of_gpu_memory_to_use=0.8
fi

# NOTE(review): --image_shape passes 4 channels together with NHWC layout;
# presumably intentional channel padding -- confirm against train.py.
#
# Variable expansions are quoted so that values containing whitespace (such as
# a dataset path with spaces) reach train.py as a single argument.
python3.7 train.py \
    --model=ResNet50 \
    --num_epochs=20 \
    --data_dir="${DATA_DIR}" \
    --batch_size=256 \
    --total_images=1281167 \
    --image_shape 4 224 224 \
    --class_dim=1000 \
    --print_step=10 \
    --model_save_dir=output/ \
    --lr_strategy=piecewise_decay \
    --scale_loss=128.0 \
    --use_dynamic_loss_scaling=true \
    --data_format="${DATA_FORMAT}" \
    --fuse_elewise_add_act_ops=true \
    --fuse_bn_act_ops=true \
    --fuse_bn_add_act_ops=true \
    --enable_addto="${USE_ADDTO}" \
    --validate=true \
    --is_profiler=false \
    --profiler_path=profile/ \
    --reader_thread=10 \
    --reader_buf_size=4000 \
    --use_dali="${USE_DALI}" \
    --lr=0.1 \
    --use_amp_bf16="${USE_AMP_BF16}" \
    --use_pure_bf16="${USE_PURE_BF16}"


17 changes: 13 additions & 4 deletions PaddleCV/image_classification/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@ def build_program(is_train, main_prog, startup_prog, args):
use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
use_pure_fp16=args.use_pure_fp16,
use_fp16_guard=True)
elif args.use_amp_bf16:
optimizer = paddle.static.amp.bf16.decorate_bf16(
optimizer,
amp_lists=paddle.static.amp.bf16.
AutoMixedPrecisionListsBF16(
custom_bf16_list={"conv2d"}),
use_bf16_guard=None,
use_pure_bf16=args.use_pure_bf16)

optimizer.minimize(avg_cost)
if args.use_ema:
Expand Down Expand Up @@ -220,10 +228,11 @@ def train(args):
#init model by checkpoint or pretrained model.
init_model(exe, args, train_prog)

if args.use_amp:
optimizer.amp_init(place,
scope=paddle.static.global_scope(),
test_program=test_prog if args.validate else None)
if args.use_amp or args.use_amp_bf16:
optimizer.amp_init(
place,
scope=paddle.static.global_scope(),
test_program=test_prog if args.validate else None)

num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
if args.use_dali:
Expand Down
11 changes: 6 additions & 5 deletions PaddleCV/image_classification/utils/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ def parse_args():
add_arg('fuse_bn_act_ops', bool, False, "Whether to use batch_norm and act fusion.")
add_arg('fuse_bn_add_act_ops', bool, True, "Whether to use batch_norm, elementwise_add and act fusion. This is only used for AMP training.")
add_arg('enable_addto', bool, False, "Whether to enable the addto strategy for gradient accumulation or not. This is only used for AMP training.")

add_arg('use_amp_bf16', bool, False, "Whether to enable mixed precision training with bf16." )
add_arg('use_pure_bf16', bool, False, "Whether to use the pure bf16 training." )

add_arg('use_label_smoothing', bool, False, "Whether to use label_smoothing")
add_arg('label_smoothing_epsilon', float, 0.1, "The value of label_smoothing_epsilon parameter")
#NOTE: (2019/08/08) temporary disable use_distill
Expand Down Expand Up @@ -538,7 +540,7 @@ def best_strategy_compiled(args,
"PaddlePaddle version 1.7.0 or higher is "
"required when you want to fuse batch_norm and activation_op.")
build_strategy.fuse_elewise_add_act_ops = args.fuse_elewise_add_act_ops

try:
build_strategy.fuse_bn_add_act_ops = args.fuse_bn_add_act_ops
except Exception as e:
Expand All @@ -548,9 +550,8 @@ def best_strategy_compiled(args,
try:
build_strategy.enable_addto = args.enable_addto
except Exception as e:
logger.info(
"PaddlePaddle 2.0-rc or higher is "
"required when you want to enable addto strategy.")
logger.info("PaddlePaddle 2.0-rc or higher is "
"required when you want to enable addto strategy.")

exec_strategy = fluid.ExecutionStrategy()

Expand Down