Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tracing improvements #1492

Merged
merged 14 commits into from
Jun 28, 2024
73 changes: 8 additions & 65 deletions programming_examples/utils/parse_eventIR.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
import shutil
import os
from aie.utils.trace_events_enum import CoreEvent, MemEvent, PLEvent, MemTileEvent

# Number of different trace types, currently 4
# core: pkt type 0
Expand Down Expand Up @@ -728,72 +729,14 @@ def parse_mlir_trace_events(lines):
def lookup_event_name_by_type(trace_type, code):
# def lookup_event_name_by_type(trace_type, loc, event, pid_events):
event = ""
# Core traces
# code = pid_events[trace_type][loc][event]
if trace_type == 0:
if code == 0x1:
event = "True"
elif code == 23: # 0x17:
event = "MemoryStall"
elif code == 24: # 0x18:
event = "StreamStall"
elif code == 26: # 0x1A:
event = "LockStall"
elif code == 32: # 0x20, all events 33-45
event = "CoreProgramFlow"
elif code == 33: # 0x21:
event = "Event0"
elif code == 34: # 0x22:
event = "Event1"
elif code == 37: # 0x25:
event = "VectorInstr"
elif code == 38: # 0x26:
event = "InstrLoad"
elif code == 39: # 0x27:
event = "InstrStore"
elif code == 44: # 0x2C:
event = "LockAcquireInstr"
elif code == 45: # 0x2D:
event = "LockReleaseInstr"
elif code == 75: # 0x4B:
event = "PortRunning0"
elif code == 79: # 0x4F:
event = "PortRunning1"
else:
event = "Unknown"
# Mem traces
elif trace_type == 1:
# TODO Need to define these
if code == 21: # x15
event = "DMA s2mm 0 start bd"
elif code == 22: # x16
event = "DMA s2mm 1 start bd"
elif code == 23: # x17
event = "DMA mm2s 0 start bd"
elif code == 24: # x18
event = "DMA mm2s 1 start bd"
elif code == 25: # x19
event = "DMA s2mm 0 finish bd"
elif code == 26: # x1a
event = "DMA s2mm 1 finish bd"
elif code == 27: # x1b
event = "DMA mm2s 0 finish bd"
elif code == 28: # x1c
event = "DMA mm2s 1 finish bd"
elif code == 29: # x1d
event = "DMA s2mm 0 idle"
elif code == 30: # x1e
event = "DMA s2mm 1 idle"
elif code == 31: # x1f
event = "DMA mm2s 0 idle"
elif code == 32: # x20
event = "DMA mm2s 1 idle"
elif code == 33: # x21
event = "DMA s2mm 0 stalled lock acquire"
elif code == 34: # x22
event = "DMA s2mm 1 stalled lock acquire"
else:
event = "Unknown"
events_enum = None
if trace_type == 0: # Core traces
events_enum = CoreEvent
elif trace_type == 1: # Mem traces
events_enum = MemEvent
if events_enum is not None and code in set(x.value for x in events_enum):
event = events_enum(code).name
else:
event = "Unknown"
return event
Expand Down
95 changes: 8 additions & 87 deletions programming_examples/utils/parse_trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import sys
import re
from aie.utils.trace_events_enum import CoreEvent, MemEvent, PLEvent, MemTileEvent

# Number of different trace types, currently 4
# core: pkt type 0
Expand Down Expand Up @@ -750,94 +751,14 @@ def parse_mlir_trace_events(lines):
def lookup_event_name_by_type(trace_type, code):
# def lookup_event_name_by_type(trace_type, loc, event, pid_events):
event = ""
# Core traces
# code = pid_events[trace_type][loc][event]
if trace_type == 0:
if code == 0x1:
event = "True"
elif code == 24: # 0x18:
event = "StreamStall"
elif code == 26: # 0x1A:
event = "LockStall"
elif code == 32: # 0x20, all events 33-45
event = "CoreProgramFlow"
elif code == 33: # 0x21:
event = "Event0"
elif code == 34: # 0x22:
event = "Event1"
elif code == 37: # 0x25:
event = "VectorInstr"
elif code == 38: # 0x26:
event = "InstrLoad"
elif code == 39: # 0x27:
event = "InstrStore"
elif code == 44: # 0x2C:
event = "LockAcquireInstr"
elif code == 45: # 0x2D:
event = "LockReleaseInstr"
elif code == 75: # 0x4B:
event = "PortRunning0"
elif code == 79: # 0x4F:
event = "PortRunning1"
else:
event = "Unknown"
# Mem traces
elif trace_type == 1:
# TODO Need to define these
if code == 0x1:
event = "True"
elif code == 21: # x15
event = "DMA s2mm 0 start bd"
elif code == 22: # x16
event = "DMA s2mm 1 start bd"
elif code == 23: # x17
event = "DMA mm2s 0 start bd"
elif code == 24: # x18
event = "DMA mm2s 1 start bd"
elif code == 25: # x19
event = "DMA s2mm 0 finish bd"
elif code == 26: # x1a
event = "DMA s2mm 1 finish bd"
elif code == 27: # x1b
event = "DMA mm2s 0 finish bd"
elif code == 28: # x1c
event = "DMA mm2s 1 finish bd"
elif code == 29: # x1d
event = "DMA s2mm 0 idle"
elif code == 30: # x1e
event = "DMA s2mm 1 idle"
elif code == 31: # x1f
event = "DMA mm2s 0 idle"
elif code == 32: # x20
event = "DMA mm2s 1 idle"
elif code == 33: # x21
event = "DMA s2mm 0 stalled lock acquire"
elif code == 34: # x22
event = "DMA s2mm 1 stalled lock acquire"
else:
event = "Unknown"
# memtile traces
elif trace_type == 3:
if code == 0x1:
event = "True"
elif code == 80: # 0x50
event = "PortRunning0"
elif code == 84: # 0x54
event = "PortRunning1"
elif code == 88: # 0x58
event = "PortRunning2"
elif code == 92: # 0x5C
event = "PortRunning3"
elif code == 96: # 0x60
event = "PortRunning4"
elif code == 100: # 0x64
event = "PortRunning5"
elif code == 104: # 0x68
event = "PortRunning6"
elif code == 108: # 0x6C
event = "PortRunning7"
else:
event = "Unknown"
events_enum = None
if trace_type == 0: # Core traces
events_enum = CoreEvent
elif trace_type == 1: # Mem traces
events_enum = MemEvent
if events_enum is not None and code in set(x.value for x in events_enum):
event = events_enum(code).name
else:
event = "Unknown"
return event
Expand Down
24 changes: 16 additions & 8 deletions programming_guide/section-4/section-4b/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,14 +220,22 @@ Open https://ui.perfetto.dev in your browser and then open up the waveform json
Based on this wave, you can mouse over each chunk of contiguous data for `PortRunning0` (input dma port) and `PortRunning1` (output dma port). What is the chunk size? <img src="../../../mlir_tutorials/images/answer1.jpg" title="1024" height=25> How many input and output chunks are there? <img src="../../../mlir_tutorials/images/answer1.jpg" title="4 inputs and 4 outputs (last output might be truncated in viewer)" height=25> This should match the iteration loop bounds in our example design.

Here, there are a few common events in our waveform that are further described below.
* `Event0` - The event marking the beginning of our kernel. See [vector_scalar_mul.cc](./vector_scalar_mul.cc) where we added the function `event0()` before the loop. This is generally a handy thing to do to attach an event to the beginning of our kernel.
* `Event1` - The event marking the end of our kernel. See [vector_scalar_mul.cc](./vector_scalar_mul.cc) where we added the function `event1()` after the loop. Much like event0, attaching event1 to the end of our kernel is also helpful.
* `VectorInstr` - Vector instructions like vector MAC or vector load/store. Here, we are running a scalar implementation so there are no vector events.
* `PortRunning0` - Mapped to Port 0 which is by default configured to the S2MM0 input (DMA from stream to local memory). This is usually the first input.
* `PortRunning1` - Mapped to Port 1 which is by default configured to the MM2S0 output (DMA from local memory to stream). This is usually the first output.
* `LockStall` - Any locks stalls
* `LockAcquiresInstr` - Any lock acquire requests
* `LockReleaseInstr` - Any lock release requests
* `INSTR_EVENT_0` - The event marking the beginning of our kernel. See [vector_scalar_mul.cc](./vector_scalar_mul.cc) where we added the function `event0()` before the loop. This is generally a handy thing to do to attach an event to the beginning of our kernel.
* `INSTR_EVENT_1` - The event marking the end of our kernel. See [vector_scalar_mul.cc](./vector_scalar_mul.cc) where we added the function `event1()` after the loop. Much like event0, attaching event1 to the end of our kernel is also helpful.
* `INSTR_VECTOR` - Vector instructions like vector MAC or vector load/store. Here, we are running a scalar implementation so there are no vector events.
* `PORT_RUNNING_0` up to `PORT_RUNNING_7` - You can listen for a variety of events, such as `PORT_RUNNING`, `PORT_IDLE` or `PORT_STALLED` on up to 8 ports. To select which port to listen to, use the `PortEvent` Python class as your event. For example, to listen to master port 1:
```
from aie.utils.trace import configure_simple_tracing_aie2, PortEvent
from aie.utils.trace_events_enum import CoreEvent, MemEvent, PLEvent, MemTileEvent
configure_simple_tracing_aie2(
# ... other arguments as above
events=[PortEvent(CoreEvent.PORT_RUNNING_0, 1, master=True)]
)
```
* `PORT_RUNNING_1` - Mapped to Port 1 which is by default configured to the MM2S0 output (DMA from local memory to stream). This is usually the first output.
* `LOCK_STALL` - Any lock stalls
* `INSTR_LOCK_ACQUIRE_REQ` - Any lock acquire requests
* `INSTR_LOCK_RELEASE_REQ` - Any lock release requests

We will look at more exercises with Trace and performance measurement in the next [section](../section-4c).

Expand Down
1 change: 1 addition & 0 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ declare_mlir_python_sources(AIEPythonSources.Utils
utils/xrt.py
utils/ml.py
utils/trace.py
utils/trace_events_enum.py
)

declare_mlir_python_sources(AIEPythonSources.Extras
Expand Down
63 changes: 53 additions & 10 deletions python/utils/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
//
//===----------------------------------------------------------------------===//-->

# <ins>Python Utilities</ins>
# Python Utilities

The Python utilities are designed to simplify commonly repeated tasks and wrap them up into helper functions. They are divided into separate categories and can be added to any Python code via:
```
import aie.utils.trace as trace_utils
import aie.utils.test as test_utils
```
Thereafter, functions defined in the file can be called via `trace_utils.configure_simple_tracing_aie2(...)`.

Expand All @@ -21,7 +22,7 @@ Thereafter, functions defined in the file can be called via `trace_utils.configu
- [XRT utilities](#xrt-utilites-xrtpy) ([xrt.py](./xrt.py))
- [Machine Learning (ML) utilities](#machine-language-ml-utilites-mlpy) ([ml.py](./ml.py))

## <u>Test utilites ([test.py](./test.py))</u>
## Test utilites ([test.py](./test.py))
Test/ Host code utilities.
* `create_default_argparser`
* This creates an ArgumentParser with the following args: --xclbin, --kernel, --instr, -v, --verify, --iters, --warmup, --trace_sz, --trace_file
Expand All @@ -36,10 +37,10 @@ Test/ Host code utilities.
* Declare hardware context and use that to return the `device` and `kernel`


## <u>Trace utilites ([trace.py](./trace.py))</u>
## Trace utilites ([trace.py](./trace.py))

* `extract_trace`
* Used in some jupyter notebook python examples. Given the output buffer, its shape and dtype and the trace_size, it returns the output buffer only (as outptu_prefix) and the trace buffer (as trace_suffix)
* Used in some jupyter notebook python examples. Given the output buffer, its shape and dtype and the trace_size, it returns the output buffer only (as output_prefix) and the trace buffer (as trace_suffix)
* However, the process of extracting the output_buffer and trace_buffer can also be as simple as:
```python
entire_buffer = bo_inout1.read(OUT_SIZE, 0).view(np.uint32)
Expand All @@ -62,7 +63,7 @@ Test/ Host code utilities.
* `offset`- offset (in bytes) where trace buffer data should begin
* `start`- start event
* `stop`- stop event
* `events`- Vector of 8 events that we are tracing
* `events`- Vector of up to 8 events that we are tracing; these can be any from the `trace_events_enum` described below

The minimum function call supported is:
```python
Expand All @@ -76,7 +77,18 @@ Test/ Host code utilities.
* `offset`=0 - An offset=0 means the trace data is in its own inout buffer (not appended to another channel)
* `start`=0x1 - Start event triggers right away when tile is enabled
* `stop`=0x0 - No Stop event
* `events`=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F] - standard template of events commonly used
* `events` - a standard template of commonly used events, as shown below:
```
events=[ CoreEvent.INSTR_EVENT_1,
CoreEvent.INSTR_EVENT_0,
CoreEvent.INSTR_VECTOR,
CoreEvent.INSTR_LOCK_RELEASE_REQ,
CoreEvent.INSTR_LOCK_ACQUIRE_REQ,
CoreEvent.LOCK_STALL,
PortEvent(CoreEvent.PORT_RUNNING_0, 1, True), # master(1)
PortEvent(CoreEvent.PORT_RUNNING_1, 1, False), # slave(1)
]
```

A more common use case might be:
```python
Expand All @@ -86,7 +98,38 @@ Test/ Host code utilities.

To better appreciate what this wrapper function does, we need to delve more deeply into the details on how trace units are configured.

### <u>Configure tile trace settings</u>
### Available Events for Tracing - `trace_events_enum.py`

`trace_events_enum.py` contains a list of all traceable events on AIE-ML devices.
These include events on compute cores (`CoreEvent`), memory modules (`MemEvent`), programmable logic (`PLEvent`) and mem tiles (`MemTileEvent`).
When specifying a list of events to `configure_simple_tracing_aie2`, you can refer to events either by their name or their numeric values:
```
configure_simple_tracing_aie2(..., events=[0x22, 0x21, 0x25, 0x2D])
# or, equivalently:
configure_simple_tracing_aie2(..., events=[CoreEvent.INSTR_EVENT_1, CoreEvent.INSTR_EVENT_0, CoreEvent.INSTR_VECTOR, CoreEvent.INSTR_LOCK_RELEASE_REQ])
```

#### Port Events

There is a set of events that fire on certain activity on data memory ports.
These are `PORT_IDLE_0` through `PORT_IDLE_7`, `PORT_RUNNING_0` through `PORT_RUNNING_7`, `PORT_STALLED_0` through `PORT_STALLED_7` and finally `PORT_TLAST_0` through `PORT_TLAST_7`.
You have to specify on which port the tracing engine should listen for each of those events.
In hardware, this is done by configuring registers `0x3FF00` and `0x3FF04`.
The Python tracing utilities abstract this in `configure_simple_tracing_aie2`; you only have to specify the event as a `PortEvent` along with the corresponding port as follows:

```
configure_simple_tracing_aie2(
...,
events=[
PortEvent(CoreEvent.PORT_RUNNING_0, 1, master=True)
# This will emit an event whenever master port 1 is running.
]
)
```

`PortEvent` is defined in `aie.utils.trace` and `CoreEvent` is defined in `aie.utils.trace_events_enum`.

### Configure tile trace settings
Within the `func.func @sequence` block, we call a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and (`aiex.npu.writebd`) to configure the shimDMA.

For a given AIE2 tile, we configure the trace control registers for the tile core and tile memory separately. There are 4 registers we generally use to configure the trace unit behavior. 2 are for configuring the general trace control and the other 2 are to specify which events our tile's trace hardware is monitoring.
Expand Down Expand Up @@ -241,7 +284,7 @@ npu_write32(column=0, row=4, address=0x3FF00, value=pack4bytes(0, 0, slave(1), m
npu_write32(column=0, row=4, address=0x3FF04, value=pack4bytes(0, 0, 0, 0),)
```

### <u>Configure shimDMA</u>
### Configure shimDMA

The shimDMA needs to be configured to write the trace stream data to a valid location in DDR memory to be read by the host code. In the case of the NPU, we can use a template like the following where the main parameters that need to be defined include `buffer_length`, `buffer_offset`, `bd_id`, `ddr_id`, and `column`.

Expand Down Expand Up @@ -330,7 +373,7 @@ npu_writebd(
)
```

## <u>XRT utilites ([xrt.py](./xrt.py))</u>
## XRT utilites ([xrt.py](./xrt.py))
XRT wrapped utilities

* class `AIE_Applications`
Expand All @@ -342,7 +385,7 @@ XRT wrapped utilities
* `write_out_trace`
* `execute`

## <u>Machine Language (ML) utilites ([ml.py](./ml.py))</u>
## Machine Language (ML) utilites ([ml.py](./ml.py))
ML-related utilities

* class `CSVLogger`
Expand Down
Loading
Loading