From 55b2d3d03256f3623b5fc89b1c0ae52937e82770 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 1 Mar 2018 00:41:19 -0800 Subject: [PATCH 1/2] Add CPU time to the timeline. --- paddle/fluid/platform/device_tracer.cc | 24 ++++++++++++++++++++++-- paddle/fluid/platform/device_tracer.h | 12 +++++++++++- paddle/fluid/platform/profiler.cc | 23 +++++++++++++++++------ paddle/fluid/platform/profiler.h | 1 + paddle/fluid/platform/profiler.proto | 3 ++- tools/timeline.py | 15 +++++++++++---- 6 files changed, 64 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index d69dd74b06216..88718d06e7574 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/string/printf.h" @@ -140,6 +141,13 @@ class DeviceTracerImpl : public DeviceTracer { correlations_[id] = anno; } + void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) { + std::lock_guard l(trace_mu_); + cpu_records_.push_back( + CPURecord{anno, start_ns, end_ns, + std::hash{}(std::this_thread::get_id())}); + } + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, uint32_t stream_id, uint32_t correlation_id) { std::lock_guard l(trace_mu_); @@ -185,7 +193,6 @@ class DeviceTracerImpl : public DeviceTracer { proto::Profile profile_pb; profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); - std::map> event_times; for (const KernelRecord &r : kernel_records_) { if (correlations_.find(r.correlation_id) == correlations_.end()) { fprintf(stderr, "cannot relate a kernel activity\n"); @@ -197,7 +204,15 @@ class DeviceTracerImpl : public DeviceTracer { event->set_end_ns(r.end_ns); event->set_stream_id(r.stream_id); event->set_device_id(r.device_id); - event_times[event->name()].push_back(r.end_ns - r.start_ns); + } + + for (const CPURecord &r : cpu_records_) { + auto *event = profile_pb.add_events(); + event->set_name(r.name); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_stream_id(r.thread_id); + event->set_device_id(-1); } std::string profile_str; google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str); @@ -242,6 +257,7 @@ class DeviceTracerImpl : public DeviceTracer { uint64_t start_ns_; uint64_t end_ns_; std::vector kernel_records_; + std::vector cpu_records_; std::unordered_map correlations_; CUpti_SubscriberHandle subscriber_; }; @@ -254,6 +270,8 @@ class DeviceTracerDummy : public DeviceTracer { void AddAnnotation(uint64_t id, const std::string &anno) {} + void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {} + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, uint32_t stream_id, uint32_t correlation_id) {} @@ -285,5 +303,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; } void ClearCurAnnotation() { cur_annotation = nullptr; } +const char *CurAnnotation() { return cur_annotation; } + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 4438b9c1c4180..613af92075672 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -36,6 +36,12 @@ class DeviceTracer { uint32_t stream_id; uint32_t correlation_id; }; + struct CPURecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + uint64_t thread_id; + }; virtual ~DeviceTracer() {} // Needs to be called once before use. @@ -48,6 +54,9 @@ class DeviceTracer { // human-readable annotations. virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; + virtual void AddCPURecords(const char* anno, uint64_t start_ns, + uint64_t end_ns) = 0; + // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. virtual void AddKernelRecords(uint64_t start, uint64_t end, @@ -67,6 +76,7 @@ DeviceTracer* GetDeviceTracer(); void SetCurAnnotation(const char* anno); // Clear the name after the operation is done. void ClearCurAnnotation(); - +// Current name of the operation being run in the thread. +const char* CurAnnotation(); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index a90a3ed590c0d..094f9224f719f 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" +#include +#include #include #include #ifdef PADDLE_WITH_CUDA @@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() { .count(); } +inline uint64_t PosixInNsec() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); +} + Event::Event(EventKind kind, std::string name, uint32_t thread_id, const DeviceContext* dev_ctx) : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { @@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } -RecordEvent::RecordEvent(const std::string& name, - const DeviceContext* dev_ctx) { +RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) + : start_ns_(PosixInNsec()) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; name_ = name; @@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name, RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled) return; + DeviceTracer* tracer = GetDeviceTracer(); + if (tracer) { + tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec()); + } ClearCurAnnotation(); PopEvent(name_, dev_ctx_); } @@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key, Mark("_stop_profiler_", nullptr); g_state = ProfilerState::kDisabled; + std::vector> all_events = GetAllEvents(); + ParseEvents(all_events, sorted_key); + ResetProfiler(); DeviceTracer* tracer = GetDeviceTracer(); if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { tracer->Disable(); tracer->GenProfile(profile_path); } - - std::vector> all_events = GetAllEvents(); - ParseEvents(all_events, sorted_key); - ResetProfiler(); } void ParseEvents(std::vector>& events, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index cb9bb8a8a602d..3542ce6cda87e 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -108,6 +108,7 @@ struct RecordEvent { ~RecordEvent(); + uint64_t start_ns_; // The device context is used by Event to get the current cuda stream. const DeviceContext* dev_ctx_; // Event name diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index bdd86a0440d2b..d1cc8516dad00 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -19,7 +19,8 @@ message Event { optional string name = 1; optional uint64 start_ns = 2; optional uint64 end_ns = 3; - optional uint32 device_id = 5; + // When positive, it represents gpu id. When -1, it represents CPU. + optional int32 device_id = 5; optional uint32 stream_id = 6; } diff --git a/tools/timeline.py b/tools/timeline.py index a6eb6bee5c147..70d5ee3426a2f 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -124,15 +124,22 @@ def _allocate_pids(self): if event.device_id not in self._devices: pid = self._allocate_pid() self._devices[event.device_id] = pid - self._chrome_trace.emit_pid("device:%s" % pid, pid) + if event.device_id >= 0: + self._chrome_trace.emit_pid("gpu:%s:stream:%d" % + (pid, event.stream_id), pid) + elif event.device_id == -1: + self._chrome_trace.emit_pid("cpu:thread_hash:%d" % + event.stream_id, pid) def _allocate_events(self): for event in self._profile_pb.events: pid = self._devices[event.device_id] args = {'name': event.name} - self._chrome_trace.emit_region( - event.start_ns, (event.end_ns - event.start_ns) / 1000000.0, - pid, 0, 'Op', event.name, args) + # TODO(panyx0718): Chrome tracing only handles ms. However, some + # ops takes micro-seconds. Hence, we keep the ns here. + self._chrome_trace.emit_region(event.start_ns, + (event.end_ns - event.start_ns) / + 1.0, pid, 0, 'Op', event.name, args) def generate_chrome_trace(self): self._allocate_pids() From f3cbfc021c59c0aa5b04eca4393f80dd178adb5b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 1 Mar 2018 05:30:46 -0800 Subject: [PATCH 2/2] Add MEMCPY information --- paddle/fluid/platform/device_tracer.cc | 72 +++++++++++++++++++++++++- paddle/fluid/platform/device_tracer.h | 14 +++++ paddle/fluid/platform/dynload/cupti.h | 3 +- paddle/fluid/platform/profiler.proto | 6 ++- tools/timeline.py | 2 + 5 files changed, 94 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 88718d06e7574..265343573b3c8 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -55,6 +55,36 @@ uint64_t kAlignSize = 8; } \ } while (0) +std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) { + switch (kind) { + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: + return "MEMCPY_HtoD"; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH: + return "MEMCPY_DtoH"; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA: + return "MEMCPY_HtoA"; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH: + return "MEMCPY_AtoH"; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA: + return "MEMCPY_AtoA"; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD: + return "MEMCPY_AtoD"; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA: + return "MEMCPY_DtoA"; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD: + return "MEMCPY_DtoD"; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH: + return "MEMCPY_HtoH"; + case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP: + return "MEMCPY_PtoP"; + case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT: + return "MEMCPY_FORCE_INT"; + default: + break; + } + return "MEMCPY"; +} + void EnableActivity() { // Device activity record is created when CUDA initializes, so we // want to enable it before cuInit() or any CUDA runtime call. @@ -111,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, kernel->correlationId); break; } + case CUPTI_ACTIVITY_KIND_MEMCPY: { + auto *memcpy = + reinterpret_cast(record); + tracer->AddMemRecords( + MemcpyKind( + static_cast(memcpy->copyKind)), + memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId, + memcpy->correlationId, memcpy->bytes); + break; + } + case CUPTI_ACTIVITY_KIND_MEMCPY2: { + auto *memcpy = + reinterpret_cast(record); + tracer->AddMemRecords( + MemcpyKind( + static_cast(memcpy->copyKind)), + memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId, + memcpy->correlationId, memcpy->bytes); + break; + } default: { break; } } } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { @@ -148,6 +198,13 @@ class DeviceTracerImpl : public DeviceTracer { std::hash{}(std::this_thread::get_id())}); } + void AddMemRecords(const std::string &name, uint64_t start_ns, + uint64_t end_ns, uint32_t device_id, uint32_t stream_id, + uint32_t correlation_id, uint64_t bytes) { + mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, + stream_id, correlation_id, bytes}); + } + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, uint32_t stream_id, uint32_t correlation_id) { std::lock_guard l(trace_mu_); @@ -183,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer { CUPTI_CALL( dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); - CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); enabled_ = true; } @@ -214,6 +270,15 @@ class DeviceTracerImpl : public DeviceTracer { event->set_stream_id(r.thread_id); event->set_device_id(-1); } + for (const MemRecord &r : mem_records_) { + auto *event = profile_pb.add_events(); + event->set_name(r.name); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_stream_id(r.stream_id); + event->set_device_id(r.device_id); + event->mutable_memcopy()->set_bytes(r.bytes); + } std::string profile_str; google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str); std::ofstream profile_f; @@ -257,6 +322,7 @@ class DeviceTracerImpl : public DeviceTracer { uint64_t start_ns_; uint64_t end_ns_; std::vector kernel_records_; + std::vector mem_records_; std::vector cpu_records_; std::unordered_map correlations_; CUpti_SubscriberHandle subscriber_; @@ -272,6 +338,10 @@ class DeviceTracerDummy : public DeviceTracer { void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {} + void AddMemRecords(const std::string &name, uint64_t start_ns, + uint64_t end_ns, uint32_t device_id, uint32_t stream_id, + uint32_t correlation_id, uint64_t bytes) {} + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, uint32_t stream_id, uint32_t correlation_id) {} diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 613af92075672..23f7cdbdffc9f 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -42,6 +42,15 @@ class DeviceTracer { uint64_t end_ns; uint64_t thread_id; }; + struct MemRecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + uint32_t device_id; + uint32_t stream_id; + uint32_t correlation_id; + uint64_t bytes; + }; virtual ~DeviceTracer() {} // Needs to be called once before use. @@ -54,6 +63,11 @@ class DeviceTracer { // human-readable annotations. virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; + virtual void AddMemRecords(const std::string& name, uint64_t start_ns, + uint64_t end_ns, uint32_t device_id, + uint32_t stream_id, uint32_t correlation_id, + uint64_t bytes) = 0; + virtual void AddCPURecords(const char* anno, uint64_t start_ns, uint64_t end_ns) = 0; diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index a79868c18c14b..c1bf88f8cb690 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -74,7 +74,8 @@ extern void *cupti_dso_handle; __macro(cuptiFinalize); \ __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ - __macro(cuptiEnableCallback); + __macro(cuptiEnableCallback); \ + __macro(cuptiEnableDomain); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index d1cc8516dad00..06db7ed63895d 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -15,13 +15,17 @@ limitations under the License. */ syntax = "proto2"; package paddle.platform.proto; +message MemCopy { optional uint64 bytes = 3; } + message Event { optional string name = 1; optional uint64 start_ns = 2; optional uint64 end_ns = 3; // When positive, it represents gpu id. When -1, it represents CPU. - optional int32 device_id = 5; + optional int64 device_id = 5; optional uint32 stream_id = 6; + + optional MemCopy memcopy = 7; } message Profile { diff --git a/tools/timeline.py b/tools/timeline.py index 70d5ee3426a2f..d1d1dae2bd85e 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -135,6 +135,8 @@ def _allocate_events(self): for event in self._profile_pb.events: pid = self._devices[event.device_id] args = {'name': event.name} + if event.memcopy.bytes > 0: + args = {'mem_bytes': event.memcopy.bytes} # TODO(panyx0718): Chrome tracing only handles ms. However, some # ops takes micro-seconds. Hence, we keep the ns here. self._chrome_trace.emit_region(event.start_ns,