Add profiling information for inference example #8748

Merged · 5 commits · Mar 8, 2018
15 changes: 8 additions & 7 deletions paddle/fluid/inference/io.cc
@@ -22,14 +22,14 @@ namespace paddle {
namespace inference {

void ReadBinaryFile(const std::string& filename, std::string& contents) {
VLOG(3) << "loading model from " << filename;
std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
inputfs.seekg(0, std::ios::end);
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
fin.seekg(0, std::ios::end);
contents.clear();
contents.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
inputfs.read(&contents[0], contents.size());
inputfs.close();
contents.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&contents[0], contents.size());
fin.close();
}

bool IsPersistable(const framework::VarDesc* var) {
@@ -97,6 +97,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
std::string program_desc_str;
VLOG(3) << "loading model from " << model_filename;
ReadBinaryFile(model_filename, program_desc_str);

std::unique_ptr<framework::ProgramDesc> main_program(
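
To make the flattened view above easier to follow, this is the reworked ReadBinaryFile as it reads after the change, a minimal consolidation of the added lines (assuming, as elsewhere in Paddle, that PADDLE_ENFORCE aborts with the formatted message when its condition is false, and that io.cc already pulls in <fstream>):

void ReadBinaryFile(const std::string& filename, std::string& contents) {
  // Open the file and fail loudly if it cannot be opened.
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
  // Seek to the end to learn the size, then read everything into `contents`.
  fin.seekg(0, std::ios::end);
  contents.clear();
  contents.resize(fin.tellg());
  fin.seekg(0, std::ios::beg);
  fin.read(&contents[0], contents.size());
  fin.close();
}

The VLOG(3) line that used to live here moves into Load(), so the log message now reports the __model__ path being loaded rather than firing for every file that passes through ReadBinaryFile.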
@@ -17,10 +17,13 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h"

DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times");

TEST(inference, image_classification) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
"--batch_size=1 --repeat=1";
}

LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@@ -29,13 +32,11 @@ TEST(inference, image_classification) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc

int64_t batch_size = 1;

paddle::framework::LoDTensor input;
// Use normalized image pixels as input data,
// which should be in the range [0.0, 1.0].
SetupTensor<float>(input,
{batch_size, 3, 32, 32},
{FLAGS_batch_size, 3, 32, 32},
static_cast<float>(0),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
@@ -46,7 +47,9 @@ TEST(inference, image_classification) {
cpu_fetchs1.push_back(&output1);

// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims();

#ifdef PADDLE_WITH_CUDA
@@ -55,7 +58,9 @@ TEST(inference, image_classification) {
cpu_fetchs2.push_back(&output2);

// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims();

CheckError<float>(output1, output2);
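
SetupTensor, used above to build the input batch, is declared in test_helper.h but its body is not part of this diff. Roughly, it allocates the LoDTensor with the requested shape on the CPU and fills it with uniformly distributed random values in [lower, upper]; the following is a plausible sketch under that assumption, not the exact helper:

template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
                 paddle::framework::DDim dims,
                 T lower,
                 T upper) {
  // Allocate CPU memory for the requested shape.
  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
  // Fill every element with a uniform random value in [lower, upper].
  for (int i = 0; i < input.numel(); ++i) {
    input_ptr[i] =
        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
        lower;
  }
}

With the two new flags the binary can be invoked as, for example, ./example --dirname=path/to/your/model --batch_size=1 --repeat=1 (the usage string printed by LOG(FATAL) above); batch_size now shapes the input tensor and repeat controls how many times the inference program is rerun inside the profiled region.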
@@ -17,10 +17,13 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h"

DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times");

TEST(inference, recognize_digits) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
"--batch_size=1 --repeat=1";
}

LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@@ -29,77 +32,39 @@ TEST(inference, recognize_digits) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc

int64_t batch_size = 1;

paddle::framework::LoDTensor input;
// Use normalized image pixels as input data,
// which should be in the range [-1.0, 1.0].
SetupTensor<float>(input,
{batch_size, 1, 28, 28},
{FLAGS_batch_size, 1, 28, 28},
static_cast<float>(-1),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);

paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
for (auto is_combined : {false, true}) {
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);

// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();

#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);

// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();

CheckError<float>(output1, output2);
CheckError<float>(output1, output2);
#endif
}

TEST(inference, recognize_digits_combine) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}

LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;

// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc

paddle::framework::LoDTensor input;
// Use normalized image pixels as input data,
// which should be in the range [-1.0, 1.0].
SetupTensor<float>(
input, {1, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);

paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);

// Run inference on CPU
TestInference<paddle::platform::CPUPlace, true>(
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();

#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);

// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, true>(
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();

CheckError<float>(output1, output2);
#endif
}
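
Because the view above interleaves the removed recognize_digits_combine test with the new is_combined loop, here is a consolidated sketch of the single remaining test body after the change, reconstructed from the added lines above (a reading aid, not the verbatim committed file):

TEST(inference, recognize_digits) {
  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
                  "--batch_size=1 --repeat=1";
  }
  std::string dirname = FLAGS_dirname;

  paddle::framework::LoDTensor input;
  // Normalized image pixels in [-1.0, 1.0], shaped {batch_size, 1, 28, 28}.
  SetupTensor<float>(input,
                     {FLAGS_batch_size, 1, 28, 28},
                     static_cast<float>(-1),
                     static_cast<float>(1));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);

  // Exercise both the separate-parameter-files and the combined-file layouts.
  for (auto is_combined : {false, true}) {
    paddle::framework::LoDTensor output1;
    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
    cpu_fetchs1.push_back(&output1);

    LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
    TestInference<paddle::platform::CPUPlace>(
        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
    LOG(INFO) << output1.dims();

#ifdef PADDLE_WITH_CUDA
    paddle::framework::LoDTensor output2;
    std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
    cpu_fetchs2.push_back(&output2);

    LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
    TestInference<paddle::platform::CUDAPlace>(
        dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
    LOG(INFO) << output2.dims();

    CheckError<float>(output1, output2);
#endif
  }
}

The separate recognize_digits_combine test (and its TestInference<Place, true> instantiation) is no longer needed, since is_combined is now an ordinary runtime argument of TestInference.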
76 changes: 59 additions & 17 deletions paddle/fluid/inference/tests/test_helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
#include <time.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"

template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
@@ -87,31 +88,58 @@ void CheckError(paddle::framework::LoDTensor& output1,
EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
}

template <typename Place, bool IsCombined = false>
template <typename Place>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const int repeat = 1,
const bool is_combined = false) {
// 1. Define place, executor, scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();

// Profile the performance
paddle::platform::ProfilerState state;
if (paddle::platform::is_cpu_place(place)) {
state = paddle::platform::ProfilerState::kCPU;
} else {
#ifdef PADDLE_WITH_CUDA
state = paddle::platform::ProfilerState::kCUDA;
// The default device_id of paddle::platform::CUDAPlace is 0.
// Users can get the device_id using:
// int device_id = place.GetDeviceId();
paddle::platform::SetDeviceId(0);
#endif
}

// Enable the profiler
paddle::platform::EnableProfiler(state);

// 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
if (IsCombined) {
// All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest.
// The file names should be consistent with that used in Python API
// `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program = paddle::inference::Load(executor,
*scope,
dirname + "/" + prog_filename,
dirname + "/" + param_filename);
} else {
// Parameters are saved in separate files sited in the specified `dirname`.
inference_program = paddle::inference::Load(executor, *scope, dirname);
{
paddle::platform::RecordEvent record_event(
"init_program",
paddle::platform::DeviceContextPool::Instance().Get(place));

if (is_combined) {
// All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest.
// The file names should be consistent with that used in Python API
// `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program =
paddle::inference::Load(executor,
*scope,
dirname + "/" + prog_filename,
dirname + "/" + param_filename);
} else {
// Parameters are saved in separate files sited in the specified
// `dirname`.
inference_program = paddle::inference::Load(executor, *scope, dirname);
}
}

// 3. Get the feed_target_names and fetch_target_names
@@ -134,7 +162,21 @@ void TestInference(const std::string& dirname,
}

// 6. Run the inference program
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
{
// Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) {
paddle::platform::RecordEvent record_event(
"run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place));

executor.Run(*inference_program, scope, feed_targets, fetch_targets);
}
}

// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
"profiler.txt");
paddle::platform::ResetProfiler();

delete scope;
}
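
Stripped of the inference details and the PADDLE_WITH_CUDA guards, the profiling flow added to TestInference boils down to the pattern below (a sketch of how the calls above fit together, not a drop-in replacement for the helper):

// Choose the profiler state that matches the place the test runs on.
// (In the real helper the kCUDA branch sits behind #ifdef PADDLE_WITH_CUDA
// and also calls SetDeviceId(0).)
paddle::platform::ProfilerState state =
    paddle::platform::is_cpu_place(place)
        ? paddle::platform::ProfilerState::kCPU
        : paddle::platform::ProfilerState::kCUDA;
paddle::platform::EnableProfiler(state);

{
  // RecordEvent measures the scope it lives in and tags it with a name.
  paddle::platform::RecordEvent record_event(
      "init_program",
      paddle::platform::DeviceContextPool::Instance().Get(place));
  // ... paddle::inference::Load(...) ...
}

for (int i = 0; i < repeat; ++i) {
  paddle::platform::RecordEvent record_event(
      "run_inference",
      paddle::platform::DeviceContextPool::Instance().Get(place));
  // ... executor.Run(...) ...
}

// Stop profiling, emit the timing information, and clear the recorded events
// so later tests start from a clean profiler state.
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
                                  "profiler.txt");
paddle::platform::ResetProfiler();

Because RecordEvent is scoped, each of the repeat iterations contributes its own run_inference event, which is what makes the aggregated timings in the profiler report meaningful.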
2 changes: 1 addition & 1 deletion paddle/fluid/platform/profiler.cc
@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) {
}
#ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) {
// Generate some dummy evenets first to reduce the startup overhead.
// Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));