/* Copyright 2024 Stanford University, NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// profiling infrastructure for Realm tasks, copies, etc.
#ifndef REALM_PROFILING_H
#define REALM_PROFILING_H
#include <limits.h>
#include <vector>
#include <set>
#include <map>
#include "realm/bytearray.h"
#include "realm/processor.h"
#include "realm/memory.h"
#include "realm/instance.h"
#include "realm/faults.h"
namespace Realm {
// through the wonders of templates, users should never need to work with
// these IDs directly
enum ProfilingMeasurementID {
// NOTE(review): measurement IDs appear to be serialized into profiling
// responses (see ProfilingResponse::ids) — treat existing values/ordering
// as stable and add new IDs only just before PMID_REALM_LAST
PMID_OP_STATUS, // completion status of operation
PMID_OP_STATUS_ABNORMAL, // completion status only if abnormal
PMID_OP_BACKTRACE, // backtrace of a failed operation
PMID_OP_TIMELINE, // when task was ready, started, completed
PMID_OP_EVENT_WAITS, // intervals when operation is waiting on events
PMID_OP_PROC_USAGE, // processor used by task
PMID_OP_MEM_USAGE, // memories used by a copy
PMID_INST_STATUS, // "completion" status of an instance
PMID_INST_STATUS_ABNORMAL, // completion status only if abnormal
PMID_INST_ALLOCRESULT, // success/failure of instance allocation
PMID_INST_TIMELINE, // timeline for a physical instance
PMID_INST_MEM_USAGE, // memory and size used by an instance
PMID_PCTRS_CACHE_L1I, // L1 I$ performance counters
PMID_PCTRS_CACHE_L1D, // L1 D$ performance counters
PMID_PCTRS_CACHE_L2, // L2 D$ performance counters
PMID_PCTRS_CACHE_L3, // L3 D$ performance counters
PMID_PCTRS_IPC, // instructions/clocks performance counters
PMID_PCTRS_TLB, // TLB miss counters
PMID_PCTRS_BP, // branch predictor performance counters
PMID_OP_TIMELINE_GPU, // when a task was started and completed on the GPU
PMID_OP_SUBGRAPH_INFO, // identifying info for containing subgraph(s)
PMID_OP_FINISH_EVENT, // finish event for an operation
PMID_OP_COPY_INFO, // copy transfer details
// as the name suggests, this should always be last, allowing apps/runtimes
// sitting on top of Realm to use some of the ID space
PMID_REALM_LAST = 10000,
};
namespace ProfilingMeasurements {
// completion status of an operation (task, copy, fill, ...) - requested
// via PMID_OP_STATUS and reported regardless of success or failure
struct OperationStatus {
static const ProfilingMeasurementID ID = PMID_OP_STATUS;
enum Result {
COMPLETED_SUCCESSFULLY,
COMPLETED_WITH_ERRORS,
RUNNING,
INTERRUPT_REQUESTED, // running, but a cancellation requested
TERMINATED_EARLY,
WAITING,
READY,
CANCELLED, // cancelled without being started
};
Result result; // final (or in-flight) state of the operation
int error_code; // error code - presumably 0 on success; confirm encoding in runtime sources
ByteArray error_details; // opaque, operation-specific error payload
};
// same payload as OperationStatus, but (per the PMID comment) only
// reported when the operation did NOT complete normally - use this to
// avoid response traffic for the common success case
struct OperationAbnormalStatus : public OperationStatus {
static const ProfilingMeasurementID ID = PMID_OP_STATUS_ABNORMAL;
};
// backtrace captured from a failed operation (e.g. a faulting task)
struct OperationBacktrace {
static const ProfilingMeasurementID ID = PMID_OP_BACKTRACE;
Backtrace backtrace; // see realm/faults.h
};
// timeline of an operation's lifecycle: creation -> ready -> start ->
// end -> completion; any stage not reached keeps INVALID_TIMESTAMP
struct OperationTimeline {
static const ProfilingMeasurementID ID = PMID_OP_TIMELINE;
// all times reported in nanoseconds from the start of program execution
// on some node. This is necessary because clients can't know where the
// measurement times were recorded and therefore have no reference. There
// may be skews between the start times of different nodes.
typedef long long timestamp_t;
static const timestamp_t INVALID_TIMESTAMP = LLONG_MIN;
// default-constructs with every stage unrecorded
OperationTimeline() :
create_time(INVALID_TIMESTAMP),
ready_time(INVALID_TIMESTAMP),
start_time(INVALID_TIMESTAMP),
end_time(INVALID_TIMESTAMP),
complete_time(INVALID_TIMESTAMP)
{ }
timestamp_t create_time; // when was operation created?
timestamp_t ready_time; // when was operation ready to proceed?
timestamp_t start_time; // when did operation start?
timestamp_t end_time; // when did operation end (on processor)?
timestamp_t complete_time; // when was all work for operation complete?
// recorders stamp the corresponding field with the current time
// (bodies in realm/profiling.inl)
inline void record_create_time(void);
inline void record_ready_time(void);
inline void record_start_time(void);
inline void record_end_time(void);
inline void record_complete_time(void);
inline bool is_valid(void) const;
};
// GPU-side start/end times for a task (device timeline, as opposed to
// the host-side stages in OperationTimeline)
struct OperationTimelineGPU {
static const ProfilingMeasurementID ID = PMID_OP_TIMELINE_GPU;
// all times reported in nanoseconds from the start of program execution
// on some node. This is necessary because clients can't know where the
// measurement times were recorded and therefore have no reference. There
// may be skews between the start times of different nodes.
typedef long long timestamp_t;
static const timestamp_t INVALID_TIMESTAMP = LLONG_MIN;
// default-constructs with both stages unrecorded
OperationTimelineGPU() :
start_time(INVALID_TIMESTAMP),
end_time(INVALID_TIMESTAMP)
{ }
timestamp_t start_time; // when was the GPU started?
timestamp_t end_time; // when was the GPU completed?
// recorders stamp the corresponding field (bodies in realm/profiling.inl)
inline void record_start_time(void);
inline void record_end_time(void);
inline bool is_valid(void) const;
};
// records time intervals in which the operation was waiting on events
struct OperationEventWaits {
static const ProfilingMeasurementID ID = PMID_OP_EVENT_WAITS;
typedef long long timestamp_t;
static const timestamp_t INVALID_TIMESTAMP = LLONG_MIN;
struct WaitInterval {
timestamp_t wait_start; // when did the interval begin?
timestamp_t wait_ready; // when did the event trigger?
timestamp_t wait_end; // when did the interval actually end
Event wait_event; // which event was waited on
inline void record_wait_start(void);
inline void record_wait_ready(void);
inline void record_wait_end(void);
};
std::vector<WaitInterval> intervals;
};
// Track processor used for tasks
struct OperationProcessorUsage {
static const ProfilingMeasurementID ID = PMID_OP_PROC_USAGE;
Processor proc; // the processor the task actually ran on
};
// Track memories used for copies
struct OperationMemoryUsage {
static const ProfilingMeasurementID ID = PMID_OP_MEM_USAGE;
Memory source; // memory read by the copy
Memory target; // memory written by the copy
size_t size; // bytes transferred
};
// Track transfer details for copies
struct OperationCopyInfo {
static const ProfilingMeasurementID ID = PMID_OP_COPY_INFO;
// for each request create this
enum RequestType
{
FILL,
REDUCE,
COPY,
};
// per-request description: participating instances/fields, optional
// gather/scatter indirection, and hop count
struct InstInfo {
std::vector<RegionInstance> src_insts; // src instances
std::vector<RegionInstance> dst_insts; // dst instances
RegionInstance src_indirection_inst; // src indirection instance (gather)
RegionInstance dst_indirection_inst; // dst indirection instance (scatter)
std::vector<FieldID> src_fields; // src fields
std::vector<FieldID> dst_fields; // dst fields
FieldID src_indirection_field; // field of indirection points
FieldID dst_indirection_field; // field of indirection points
RequestType request_type; // fill, reduce, copy
unsigned int num_hops; // num_hops for each request
};
std::vector<InstInfo> inst_info; // one entry per transfer request
};
// the event that triggers when the profiled operation finishes - lets a
// profiler correlate responses with application-visible events
struct OperationFinishEvent {
static const ProfilingMeasurementID ID = PMID_OP_FINISH_EVENT;
Event finish_event;
};
struct OperationSubgraphInfo {
static const ProfilingMeasurementID ID = PMID_OP_SUBGRAPH_INFO;
// TODO: probably can do something more useful here
// finish events listed from inside out (i.e. [0] is immediately
// containing subgraph)
std::vector<Event> subgraph_finish_events;
};
// Track the status of an instance
struct InstanceStatus {
static const ProfilingMeasurementID ID = PMID_INST_STATUS;
enum Result {
AWAITING_ALLOCATION,
FAILED_ALLOCATION,
CANCELLED_ALLOCATION, // cancelled/poisoned before allocation
ALLOCATED,
DESTROYED_SUCCESSFULLY,
CORRUPTED,
MEMORY_LOST,
INSTANCE_COUNT_EXCEEDED,
};
Result result; // final (or current) state of the instance
int error_code; // error code for abnormal results - confirm encoding in runtime sources
ByteArray error_details; // opaque error payload
};
// as with OperationAbnormalStatus: same payload, but only reported when
// the instance's lifecycle ended abnormally
struct InstanceAbnormalStatus : public InstanceStatus {
static const ProfilingMeasurementID ID = PMID_INST_STATUS_ABNORMAL;
};
// simple boolean indicating whether or not allocation is expected to
// succeed
struct InstanceAllocResult {
static const ProfilingMeasurementID ID = PMID_INST_ALLOCRESULT;
bool success;
};
// Track the timeline of an instance
struct InstanceTimeline {
static const ProfilingMeasurementID ID = PMID_INST_TIMELINE;
// all times reported in nanoseconds from the start of program execution
// on some node. This is necessary because clients can't know where the
// measurement times were recorded and therefore have no reference. There
// may be skews between the start times of different nodes.
// NOTE: unlike the operation timelines above, this uses an unsigned
// timestamp with 0 (not LLONG_MIN) as the "not recorded" sentinel
typedef unsigned long long timestamp_t;
static const timestamp_t INVALID_TIMESTAMP = 0;
RegionInstance instance; // which instance this timeline describes
timestamp_t create_time; // when was instance created?
timestamp_t ready_time; // when was instance ready for use?
timestamp_t delete_time; // when was the instance deleted?
// recorders stamp the corresponding field (bodies in realm/profiling.inl)
inline void record_create_time(void);
inline void record_ready_time(void);
inline void record_delete_time(void);
};
// Track properties of an instance
struct InstanceMemoryUsage {
static const ProfilingMeasurementID ID = PMID_INST_MEM_USAGE;
RegionInstance instance; // which instance
Memory memory; // memory holding the instance
size_t bytes; // footprint of the instance in that memory
};
// Processor cache stats
// templated on the measurement ID so one struct serves all cache levels
template <ProfilingMeasurementID _ID>
struct CachePerfCounters {
static const ProfilingMeasurementID ID = _ID;
long long accesses;
long long misses;
};
typedef CachePerfCounters<PMID_PCTRS_CACHE_L1I> L1ICachePerfCounters;
typedef CachePerfCounters<PMID_PCTRS_CACHE_L1D> L1DCachePerfCounters;
typedef CachePerfCounters<PMID_PCTRS_CACHE_L2> L2CachePerfCounters;
typedef CachePerfCounters<PMID_PCTRS_CACHE_L3> L3CachePerfCounters;
// instructions/cycles
struct IPCPerfCounters {
static const ProfilingMeasurementID ID = PMID_PCTRS_IPC;
long long total_insts; // all retired instructions
long long total_cycles; // elapsed cycles (IPC = total_insts / total_cycles)
long long fp_insts; // floating-point instructions
long long ld_insts; // load instructions
long long st_insts; // store instructions
long long br_insts; // branch instructions
};
struct TLBPerfCounters {
static const ProfilingMeasurementID ID = PMID_PCTRS_TLB;
long long inst_misses; // instruction-TLB misses
long long data_misses; // data-TLB misses
};
struct BranchPredictionPerfCounters {
static const ProfilingMeasurementID ID = PMID_PCTRS_BP;
long long total_branches;
long long taken_branches;
long long mispredictions;
};
};
// a single profiling request: names the task (on a given processor) that
// will receive the response, plus the set of measurements to collect and
// optional user data echoed back in the response
class REALM_PUBLIC_API ProfilingRequest {
public:
// response is delivered by spawning _response_task_id on _response_proc;
// if _report_if_empty, a response is sent even with no measurements
ProfilingRequest(Processor _response_proc, Processor::TaskFuncID _response_task_id, int _priority = 0, bool _report_if_empty = false);
ProfilingRequest(const ProfilingRequest& to_copy);
~ProfilingRequest(void);
ProfilingRequest& operator=(const ProfilingRequest &rhs);
// attaches opaque user data that is copied into every response
ProfilingRequest& add_user_data(const void *payload, size_t payload_size);
// adds measurement T (any ProfilingMeasurements:: type with a static ID)
template <typename T>
ProfilingRequest &add_measurement(void);
ProfilingRequest &add_measurement(ProfilingMeasurementID measurement_id);
ProfilingRequest &add_measurements(const std::set<ProfilingMeasurementID>& measurement_ids);
// reconstructs a heap-allocated request from a serialized stream
template <typename S> static ProfilingRequest *deserialize_new(S &s);
protected:
friend class ProfilingMeasurementCollection;
template <typename S> friend bool serialize(S &s, const ProfilingRequest &pr);
Processor response_proc; // where the response task runs
Processor::TaskFuncID response_task_id; // which task receives the response
int priority; // priority for the response task
bool report_if_empty; // send a response even if nothing was measured
ByteArray user_data; // echoed back verbatim in the response
std::set<ProfilingMeasurementID> requested_measurements;
};
// manages a set of profiling requests attached to a Realm operation
class REALM_PUBLIC_API ProfilingRequestSet {
public:
ProfilingRequestSet(void);
ProfilingRequestSet(const ProfilingRequestSet& to_copy);
~ProfilingRequestSet(void);
ProfilingRequestSet& operator=(const ProfilingRequestSet &rhs);
// creates (and owns) a new request; the returned reference is used to
// chain add_measurement() calls - payload is optional user data
ProfilingRequest& add_request(Processor response_proc,
Processor::TaskFuncID response_task_id,
const void *payload = 0, size_t payload_size = 0,
int priority = 0,
bool report_if_empty = false);
size_t request_count(void) const; // number of requests currently held
bool empty(void) const; // true iff no requests have been added
void clear(void); // removes (and frees) all requests
protected:
friend class ProfilingMeasurementCollection;
template <typename S> friend bool serialize(S &s, const ProfilingRequestSet &prs);
template <typename S> friend bool deserialize(S &s, ProfilingRequestSet &prs);
std::vector<ProfilingRequest *> requests; // owned heap pointers (freed by clear()/dtor)
};
// runtime-internal: accumulates recorded measurements for one operation
// and sends each ProfilingRequest's response once all of the measurements
// it asked for have been recorded
class REALM_INTERNAL_API ProfilingMeasurementCollection {
public:
ProfilingMeasurementCollection(void);
~ProfilingMeasurementCollection(void);
// registers which measurements the requests in 'prs' want
void import_requests(const ProfilingRequestSet& prs);
// sends responses for any requests in 'prs' not already satisfied
void send_responses(const ProfilingRequestSet& prs);
void clear(void);
// clears only recorded measurements (keeps request info)
void clear_measurements(void);
// true if any imported request asked for measurement T - callers use
// this to skip the cost of measuring when nobody wants the result
template <typename T>
bool wants_measurement(void) const;
// records a measurement; may immediately send responses for requests
// that are now complete (unless send_complete_responses is false)
template <typename T>
void add_measurement(const T& data, bool send_complete_responses = true);
protected:
void send_response(const ProfilingRequest& pr) const;
// in order to efficiently send responses as soon as we have all the requested measurements, we
// need to know which profiling requests are needed by a given measurement and how many more
// measurements each request wants
std::map<ProfilingMeasurementID, std::vector<const ProfilingRequest *> > requested_measurements;
std::map<const ProfilingRequest *, int> measurements_left;
bool completed_requests_present; // set if a request is completed but could not be sent right away
std::map<ProfilingMeasurementID, ByteArray> measurements; // serialized measurement data by ID
};
// read-side view of a profiling response, parsed out of the argument
// buffer handed to the response task; does not own or copy '_data', so
// the response must not outlive that buffer
class REALM_PUBLIC_API ProfilingResponse {
public:
// responses need to be deserialized from the response task's argument data
ProfilingResponse(const void *_data, size_t _data_size);
~ProfilingResponse(void);
const void *user_data(void) const; // payload given to add_user_data(), if any
size_t user_data_size(void) const;
// even if a measurement was requested, it may not have been performed - use
// this to check
template <typename T>
bool has_measurement(void) const;
// extracts a measurement (if available), returning a dynamically allocated result -
// caller should delete it when done
template <typename T>
T *get_measurement(void) const;
// extracts a measurement (if available), filling in a caller-allocated
// result - returns true if result available, false if not
template <typename T>
bool get_measurement(T& result) const;
protected:
const char *data; // borrowed pointer to the raw response bytes
size_t data_size;
int measurement_count; // number of (id, offset, size) entries in 'ids'
size_t user_data_offset; // where the user payload starts within 'data'
const int *ids; // index table pointing into 'data'
REALM_INTERNAL_API_EXTERNAL_LINKAGE
// locates measurement 'id' in the index table; false if not present
bool find_id(int id, int& offset, int& size) const;
};
}; // namespace Realm
#include "realm/profiling.inl"
#endif // ifndef REALM_PROFILING_H