Skip to content

Commit

Permalink
Add MEMCPY information
Browse files Browse the repository at this point in the history
  • Loading branch information
panyx0718 committed Mar 2, 2018
1 parent 55b2d3d commit f3cbfc0
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 3 deletions.
72 changes: 71 additions & 1 deletion paddle/fluid/platform/device_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,36 @@ uint64_t kAlignSize = 8;
} \
} while (0)

std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
switch (kind) {
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
return "MEMCPY_HtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
return "MEMCPY_DtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
return "MEMCPY_HtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
return "MEMCPY_AtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
return "MEMCPY_AtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
return "MEMCPY_AtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
return "MEMCPY_DtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
return "MEMCPY_DtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
return "MEMCPY_HtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
return "MEMCPY_PtoP";
case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
return "MEMCPY_FORCE_INT";
default:
break;
}
return "MEMCPY";
}

void EnableActivity() {
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
Expand Down Expand Up @@ -111,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
kernel->correlationId);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY2: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
default: { break; }
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
Expand Down Expand Up @@ -148,6 +198,13 @@ class DeviceTracerImpl : public DeviceTracer {
std::hash<std::thread::id>{}(std::this_thread::get_id())});
}

void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}

void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_);
Expand Down Expand Up @@ -183,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));

CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
enabled_ = true;
}
Expand Down Expand Up @@ -214,6 +270,15 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_stream_id(r.thread_id);
event->set_device_id(-1);
}
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
std::ofstream profile_f;
Expand Down Expand Up @@ -257,6 +322,7 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
Expand All @@ -272,6 +338,10 @@ class DeviceTracerDummy : public DeviceTracer {

void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}

void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {}

void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {}

Expand Down
14 changes: 14 additions & 0 deletions paddle/fluid/platform/device_tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ class DeviceTracer {
uint64_t end_ns;
uint64_t thread_id;
};
struct MemRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
uint32_t correlation_id;
uint64_t bytes;
};

virtual ~DeviceTracer() {}
// Needs to be called once before use.
Expand All @@ -54,6 +63,11 @@ class DeviceTracer {
// human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;

virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id,
uint64_t bytes) = 0;

virtual void AddCPURecords(const char* anno, uint64_t start_ns,
uint64_t end_ns) = 0;

Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/platform/dynload/cupti.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback);
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);

CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);

Expand Down
6 changes: 5 additions & 1 deletion paddle/fluid/platform/profiler.proto
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,17 @@ limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;

message MemCopy { optional uint64 bytes = 3; }

message Event {
optional string name = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
// When positive, it represents gpu id. When -1, it represents CPU.
optional int32 device_id = 5;
optional int64 device_id = 5;
optional uint32 stream_id = 6;

optional MemCopy memcopy = 7;
}

message Profile {
Expand Down
2 changes: 2 additions & 0 deletions tools/timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ def _allocate_events(self):
for event in self._profile_pb.events:
pid = self._devices[event.device_id]
args = {'name': event.name}
if event.memcopy.bytes > 0:
args = {'mem_bytes': event.memcopy.bytes}
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
self._chrome_trace.emit_region(event.start_ns,
Expand Down

0 comments on commit f3cbfc0

Please sign in to comment.