Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CPU time and MemCopy to the timeline. #8679

Merged
merged 2 commits into from
Mar 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 93 additions & 3 deletions paddle/fluid/platform/device_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ limitations under the License. */
#include <map>
#include <mutex>
#include <numeric>
#include <thread>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h"
Expand Down Expand Up @@ -54,6 +55,36 @@ uint64_t kAlignSize = 8;
} \
} while (0)

std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
switch (kind) {
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
return "MEMCPY_HtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
return "MEMCPY_DtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
return "MEMCPY_HtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
return "MEMCPY_AtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
return "MEMCPY_AtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
return "MEMCPY_AtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
return "MEMCPY_DtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
return "MEMCPY_DtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
return "MEMCPY_HtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
return "MEMCPY_PtoP";
case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
return "MEMCPY_FORCE_INT";
default:
break;
}
return "MEMCPY";
}

void EnableActivity() {
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
Expand Down Expand Up @@ -110,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
kernel->correlationId);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY2: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
default: { break; }
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
Expand Down Expand Up @@ -140,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_[id] = anno;
}

void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
}

void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}

void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_);
Expand Down Expand Up @@ -175,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));

CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
enabled_ = true;
}
Expand All @@ -185,7 +249,6 @@ class DeviceTracerImpl : public DeviceTracer {
proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_);
std::map<std::string, std::vector<uint64_t>> event_times;
for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
Expand All @@ -197,7 +260,24 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event_times[event->name()].push_back(r.end_ns - r.start_ns);
}

for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.thread_id);
event->set_device_id(-1);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use a const value to represent CPU instead of -1, like

const kCpuDeviceId = -1;

}
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
Expand Down Expand Up @@ -242,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
};
Expand All @@ -254,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {

void AddAnnotation(uint64_t id, const std::string &anno) {}

void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}

void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {}

void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {}

Expand Down Expand Up @@ -285,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }

void ClearCurAnnotation() { cur_annotation = nullptr; }

const char *CurAnnotation() { return cur_annotation; }

} // namespace platform
} // namespace paddle
26 changes: 25 additions & 1 deletion paddle/fluid/platform/device_tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,21 @@ class DeviceTracer {
uint32_t stream_id;
uint32_t correlation_id;
};
struct CPURecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint64_t thread_id;
};
struct MemRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
uint32_t correlation_id;
uint64_t bytes;
};

virtual ~DeviceTracer() {}
// Needs to be called once before use.
Expand All @@ -48,6 +63,14 @@ class DeviceTracer {
// human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;

virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id,
uint64_t bytes) = 0;

virtual void AddCPURecords(const char* anno, uint64_t start_ns,
uint64_t end_ns) = 0;

// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end,
Expand All @@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();

// Current name of the operation being run in the thread.
const char* CurAnnotation();
} // namespace platform
} // namespace paddle
3 changes: 2 additions & 1 deletion paddle/fluid/platform/dynload/cupti.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback);
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);

CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);

Expand Down
23 changes: 17 additions & 6 deletions paddle/fluid/platform/profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <map>
#ifdef PADDLE_WITH_CUDA
Expand Down Expand Up @@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
.count();
}

inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}

Event::Event(EventKind kind, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
Expand Down Expand Up @@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
}

RecordEvent::RecordEvent(const std::string& name,
const DeviceContext* dev_ctx) {
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
: start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
name_ = name;
Expand All @@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,

RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
}
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
}
Expand Down Expand Up @@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;

std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer();
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
tracer->Disable();
tracer->GenProfile(profile_path);
}

std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
}

void ParseEvents(std::vector<std::vector<Event>>& events,
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/platform/profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ struct RecordEvent {

~RecordEvent();

uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_;
// Event name
Expand Down
7 changes: 6 additions & 1 deletion paddle/fluid/platform/profiler.proto
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,17 @@ limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;

message MemCopy { optional uint64 bytes = 3; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why start from 3?


message Event {
optional string name = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
optional uint32 device_id = 5;
// When positive, it represents gpu id. When -1, it represents CPU.
optional int64 device_id = 5;
optional uint32 stream_id = 6;

optional MemCopy memcopy = 7;
}

message Profile {
Expand Down
17 changes: 13 additions & 4 deletions tools/timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,24 @@ def _allocate_pids(self):
if event.device_id not in self._devices:
pid = self._allocate_pid()
self._devices[event.device_id] = pid
self._chrome_trace.emit_pid("device:%s" % pid, pid)
if event.device_id >= 0:
self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
(pid, event.stream_id), pid)
elif event.device_id == -1:
self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
event.stream_id, pid)

def _allocate_events(self):
for event in self._profile_pb.events:
pid = self._devices[event.device_id]
args = {'name': event.name}
self._chrome_trace.emit_region(
event.start_ns, (event.end_ns - event.start_ns) / 1000000.0,
pid, 0, 'Op', event.name, args)
if event.memcopy.bytes > 0:
args = {'mem_bytes': event.memcopy.bytes}
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
self._chrome_trace.emit_region(event.start_ns,
(event.end_ns - event.start_ns) /
1.0, pid, 0, 'Op', event.name, args)

def generate_chrome_trace(self):
self._allocate_pids()
Expand Down