Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"fix accuracy kernel bug" #5673

Merged
merged 2 commits into from
Nov 15, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions paddle/operators/accuracy_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License. */
#include <thrust/reduce.h>
#include "paddle/operators/accuracy_op.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/gpu_info.h"

namespace paddle {
namespace operators {
Expand Down Expand Up @@ -73,26 +74,28 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {

int num_samples = static_cast<int>(inference->dims()[0]);
size_t infer_width = inference->dims()[1];
PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
// cudaMemset((void**)&correct_data, 0, sizeof(float));
auto stream = ctx.cuda_device_context().stream();
platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);

if (num_samples == 0) {
return;
}
cudaMemcpy(total_data, &num_samples, sizeof(int), cudaMemcpyHostToDevice);
platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int),
cudaMemcpyHostToDevice, stream);

AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
AccuracyCudaKernel<
PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
num_samples, infer_width, indices_data, label_data, correct_data,
accuracy_data);

int d_num_samples, d_num_correct;
float d_accuracy;
cudaMemcpy(&d_num_correct, correct_data, sizeof(int),
cudaMemcpyDeviceToHost);
cudaMemcpy(&d_num_samples, total_data, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&d_accuracy, accuracy_data, sizeof(float),
cudaMemcpyDeviceToHost);
platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int),
cudaMemcpyDeviceToHost, stream);
platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int),
cudaMemcpyDeviceToHost, stream);
platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float),
cudaMemcpyDeviceToHost, stream);
}
};

Expand Down
5 changes: 5 additions & 0 deletions paddle/platform/gpu_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,5 +109,10 @@ void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
}

void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
}
} // namespace platform
} // namespace paddle
3 changes: 3 additions & 0 deletions paddle/platform/gpu_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream);

//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);

} // namespace platform
} // namespace paddle

Expand Down