"""
.. seealso::

    :py:mod:`codegen.test_1_local_padding`
"""
import filecmp
import logging
import os

import numpy as np
from flaky import flaky

from shared import CUDAContext, dietcode_decor, NoLocalPadding, \
                   DoLoopPartitioning, tolerance
from ops.shared.utils import get_time_evaluator_results_rpc_wrapper

logger = logging.getLogger(__name__)


@flaky(max_runs=3)
@dietcode_decor
def test_loop_partitioning():
    """
    This test case shows the performance improvement brought by loop
    partitioning, used as an optimization technique in the [Nimble]_ paper.
    Similar to local padding, the goal of loop partitioning is also to
    mitigate the performance overhead brought by out-of-boundary checks.

    Given below is an example that illustrates how loop partitioning works:

    .. code-block:: C++

        for (i = 0; i < ceil(T / t); ++i)
          for (j = 0; j < t; ++j)
            if (i * t + j < T)
              // do something

    The above code snippet can be transformed into

    .. code-block:: C++

        for (i = 0; i < floor(T / t); ++i)
          for (j = 0; j < t; ++j)
            // do something
        for (j = 0; j < T - floor(T / t) * t; ++j)
          // do something

    which does not have any predicates.
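
    As a quick sanity check (a minimal sketch with a hypothetical extent and
    tile size, not part of the benchmark below), the two partitioned loops
    together execute exactly the :math:`T` iterations that the predicated
    loop nest guards:

    .. code-block:: python

        T, t = 60, 16  # hypothetical loop extent and tile size
        # iterations executed by the original, predicated loop nest
        predicated = sum(1 for i in range((T + t - 1) // t)
                           for j in range(t) if i * t + j < T)
        # iterations executed by the two partitioned loops
        partitioned = (T // t) * t + (T - (T // t) * t)
        assert predicated == partitioned == T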

    We compare the compute throughputs between four schedules:

    1. without any optimization (baseline)
    2. loop partitioning
    3. local padding
    4. local padding + loop partitioning

    The benchmark we use is the same as the one in local padding
    (:py:func:`codegen.test_1_local_padding.test_local_padding`). Our
    evaluations show that

    1. Loop partitioning can also significantly boost the performance of the
       generated CUDA kernel by as much as :math:`10\\times` (the same order
       of speedup as local padding).
    2. Although local padding and loop partitioning are orthogonal to each
       other, applying them jointly is NOT able to boost the performance in
       this case.

    The table below illustrates the results we get from the CI workflow:

    =========== ======== ================= ============= ===================
    GPU         Baseline Loop Partitioning Local Padding | Local Padding +
                                                          | Loop Partitioning
    =========== ======== ================= ============= ===================
    RTX 3090    ~0.98    ~11.2             ~11.6         ~11.0
    RTX 2080 Ti ~0.43    ~5.98             ~5.27
    =========== ======== ================= ============= ===================

    where the numbers denote the compute throughputs (in TFLOPs/sec), and
    hence the higher the better.
    """
    from ops.dense.sample_schedule import dense_128x128x4
    from ops.dense.fixture import Dense, cuBLASDenseFixture

    B, T, I, H = 16, 60, 770, 2304
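    # Our reading of the workload (not asserted by the fixture API): the
    # dense layer amounts to a GEMM over the (B * T) x I x H dimensions, so
    # at 2 floating-point operations per multiply-accumulate it performs
    # 2 * (B * T) * I * H FLOPs in total, expressed below in TFLOPs.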
    TFLOPs = 2 * B * T * I * H / 1e12

    wkl_func_args = (B * T, I, H)
    cublas_fixture = cuBLASDenseFixture(*wkl_func_args)
    # temporarily disable local padding
    with NoLocalPadding():
        baseline_perf_results = get_time_evaluator_results_rpc_wrapper(
            wkl_func=Dense,
            wkl_func_args=wkl_func_args,
            sched_func_or_str=dense_128x128x4,
            fixture=cublas_fixture,
            print_kernel=True
        )
    with NoLocalPadding(), DoLoopPartitioning():
        loop_partitioning_perf_results = get_time_evaluator_results_rpc_wrapper(
            wkl_func=Dense,
            wkl_func_args=wkl_func_args,
            sched_func_or_str=dense_128x128x4,
            fixture=cublas_fixture,
            print_kernel=True,
            log_kernel_filename="temp_workspace.log",
            verify_correctness=True
        )
    with DoLoopPartitioning():
        partition_and_pad_perf_results = \
            get_time_evaluator_results_rpc_wrapper(
                wkl_func=Dense,
                wkl_func_args=wkl_func_args,
                sched_func_or_str=dense_128x128x4,
                fixture=cublas_fixture,
                print_kernel=True,
                log_kernel_filename="temp_workspace_ii.log",
                verify_correctness=True
            )
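
    # Convert the average measured kernel latency (in seconds) into compute
    # throughput (in TFLOPs/sec) so that the three schedules can be compared.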
    baseline_tflops = TFLOPs / np.average(baseline_perf_results)
    loop_partitioning_tflops = TFLOPs / np.average(loop_partitioning_perf_results)
    partition_and_pad_tflops = TFLOPs / np.average(partition_and_pad_perf_results)
    logger.info("Baseline vs. Loop Partitioning vs. Local Padding + Loop Partitioning: "
                f"{baseline_tflops} vs. {loop_partitioning_tflops} vs. "
                f"{partition_and_pad_tflops} (TFLOPs/sec)")
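
    # The CUDA kernel dumped to temp_workspace.log should be identical to the
    # reference kernel stored under saved_artifacts/.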
    assert filecmp.cmp(os.path.dirname(os.path.realpath(__file__))
                       + "/saved_artifacts/test_loop_partitioning.cu",
                       "temp_workspace.log")

    if CUDAContext.device_name == 'NVIDIA GeForce RTX 3090':
        np.testing.assert_allclose(baseline_tflops, 0.98, **tolerance)
        np.testing.assert_allclose(loop_partitioning_tflops, 11.2, **tolerance)
        np.testing.assert_allclose(partition_and_pad_tflops, 11.0, **tolerance)
    if CUDAContext.device_name == 'NVIDIA GeForce RTX 2080 Ti':
        np.testing.assert_allclose(baseline_tflops, 0.42, **tolerance)
        np.testing.assert_allclose(loop_partitioning_tflops, 5.98, **tolerance)


@flaky(max_runs=3)
@dietcode_decor
def test_loop_partitioning_ii():
    """
    Therefore, one natural question to ask is:

    **What is the difference between local padding and loop partitioning,
    given that they share the same objective?**

    Although the two techniques are indeed similar in what they aim to
    achieve, we pick local padding primarily for the following reasons:

    - Because of its *partitioning* nature, **loop partitioning needs to
      duplicate the body statements** (as can be seen in the simple example
      illustrated before, where the comment ``// do something`` appears
      twice). Depending on the number of spatial axes, this duplication can
      happen multiple times. This can significantly lengthen the CUDA kernel
      body, and can further lead to a gigantic kernel when the original
      kernel body is already long (e.g., because of a large unrolling
      factor), **which eventually increases the compilation time for the
      kernel** (several minutes for a single kernel by our measurements).
    - **There are cases that can be handled by local padding but NOT by loop
      partitioning**. We refer to the example below (a minimal numeric sketch
      follows this list), which is again the same as the one in local padding
      (:py:func:`codegen.test_1_local_padding.test_local_padding_ii`). In
      this example, all the thread blocks possess the same predicates, which
      prevents loop partitioning from taking place (since there is no region
      that can be partitioned out without predicates).
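
    As a rough illustration (a minimal sketch, assuming the 128x128 tile in
    the schedule name below applies to the :math:`T = 120` dimension), when
    the loop extent is smaller than the tile size the predicate-free
    partition is empty, so every thread block still has to evaluate its
    boundary check and partitioning gains nothing:

    .. code-block:: python

        T, t = 120, 128          # hypothetical extent vs. tile size
        assert T // t == 0       # no full tile can be partitioned out
        remainder = T - (T // t) * t
        assert remainder == T    # all iterations stay in the guarded loop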

    Nevertheless, loop partitioning still has one advantage that local
    padding does not: since **loop partitioning does not involve padding**,
    it does not perform any redundant computations, which explains why its
    performance can be better than that of local padding on the RTX 2080 Ti
    (as can be seen in the previous table). However, this is NOT a problem
    for the DietCode auto-scheduling framework, since we expect the
    auto-scheduling outcomes to have a relatively low padding ratio.

    .. [Nimble] H. Shen et al. *Nimble: Efficiently Compiling Dynamic Neural
       Networks for Model Inference*. MLSys 2021.
    """
    from ops.batch_matmul.sample_schedule import batch_matmul_nt_1x128x128x8
    from ops.batch_matmul.fixture import BatchMatmulNT, cuBLASBatchMatmulNTFixture

    B, T, H, NH = 16, 120, 768, 12
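    # Our reading of the workload (not asserted by the fixture API): the
    # batched matmul runs B * NH batches of a (T x H/NH) x (H/NH x T)
    # product, i.e. 2 * (B * NH) * T * (H / NH) * T = 2 * B * T * T * H
    # FLOPs in total, expressed below in TFLOPs.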
    TFLOPs = 2 * B * T * T * H / 1e12

    wkl_func_args = (B * NH, T, H // NH, T)
    cublas_fixture = cuBLASBatchMatmulNTFixture(*wkl_func_args)
    # temporarily disable local padding
    with NoLocalPadding():
        baseline_perf_results = get_time_evaluator_results_rpc_wrapper(
            wkl_func=BatchMatmulNT,
            wkl_func_args=wkl_func_args,
            sched_func_or_str=batch_matmul_nt_1x128x128x8,
            fixture=cublas_fixture,
            print_kernel=True,
            log_kernel_filename="temp_workspace.log",
        )
    with NoLocalPadding(), DoLoopPartitioning():
        loop_partitioning_perf_results = get_time_evaluator_results_rpc_wrapper(
            wkl_func=BatchMatmulNT,
            wkl_func_args=wkl_func_args,
            sched_func_or_str=batch_matmul_nt_1x128x128x8,
            fixture=cublas_fixture,
            print_kernel=True,
            log_kernel_filename="temp_workspace_ii.log",
            verify_correctness=True
        )
    # Loop partitioning is not able to optimize for this case, hence there
    # should NOT be any difference in the generated code.
    assert filecmp.cmp("temp_workspace_ii.log", "temp_workspace.log")

    baseline_tflops = TFLOPs / np.average(baseline_perf_results)
    loop_partitioning_tflops = TFLOPs / np.average(loop_partitioning_perf_results)
    logger.info(f"Baseline vs. Loop Partitioning: {baseline_tflops} vs. "
                f"{loop_partitioning_tflops} (TFLOPs/sec)")