In [1]:
# Check that a GPU is attached to this runtime: the table printed below
# should list at least one GPU.
# If no GPU is listed, go to "Runtime", select "Change runtime type",
# and choose "T4 GPU".

!nvidia-smi

Mon Oct 16 04:08:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   46C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

In [None]:
# Clone the repository to get a local copy of the example kernel sources:
# the tuning cell reads kernel_tuner/examples/cuda/convolution.cu from it.
!git clone https://github.com/WAT-ai/kernel_tuner

In [None]:
!pip install kernel_tuner[cuda]

### Enhancing Convolution Operations with Correctness Verification

This example is almost identical to the standard Convolution example. The one important difference is that a naive kernel is run first to produce a reference output. That reference output then serves as the benchmark against which the correctness of each kernel is verified before its performance is measured.

Here's a breakdown of the process without code examples:

1. **Kernel and Data Initialization**:
    - The kernel code is read from a predefined file, and various data, including the filter size and problem size, are initialized.
    - Input data and filters are set up and computational arguments are defined to facilitate subsequent operations.
  
2. **Parameter Tuning Setup**:
    - A structured collection of tuning parameters is configured, comprising diverse block and tile sizes.
    - Options to utilize or omit padding and the read-only cache are incorporated, providing versatile tuning capabilities.

3. **Reference Output Generation**:
    - A naive convolution kernel is executed with fixed, pre-specified parameters to generate a reference output.
    - This output is crucial as it functions as a benchmark for accuracy in subsequent operations.

4. **Verification and Kernel Tuning**:
    - An `answer` list is set up with one entry per kernel argument: entries for input arguments are set to `None`, and the non-`None` entry holds the reference output used for verification.
    - The `tune_kernel` function is invoked to tune the kernel, verifying each kernel in the parameter space against the pre-established reference output as it goes.
   
This methodology centers on using a naive kernel to derive a reference output and then using that output for correctness checks during the convolution runs, all before performance benchmarking occurs. This improves computational reliability: only kernels whose output has been validated against the reference are carried through the tuning and optimization phase.

In [4]:
import numpy
import kernel_tuner
from collections import OrderedDict
from kernel_tuner.strategies import *

def tune():
    """Tune the CUDA convolution kernel with per-configuration correctness checks.

    The kernel source is read from the cloned ``kernel_tuner`` repository.
    A reference output is first computed with the ``convolution_naive``
    kernel, then ``convolution_kernel`` is tuned with that reference passed
    as ``answer`` so the tuner can compare each configuration against it.

    The search uses the ``genetic_algorithm`` strategy and is capped at
    five function evaluations (``max_fevals=5``).

    Returns:
        The value returned by ``kernel_tuner.tune_kernel`` for this search.

    Raises:
        OSError: if the kernel source file cannot be opened.
    """
    with open("kernel_tuner/examples/cuda/convolution.cu", "r") as f:
        kernel_string = f.read()

    filter_size = (17, 17)
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)
    # The input buffer is larger than the output by `border_size` elements
    # in each dimension (an even number: filter_size // 2 on either side).
    border_size = (filter_size[0] // 2 * 2, filter_size[1] // 2 * 2)
    input_size = (problem_size[0] + border_size[0]) * (problem_size[1] + border_size[1])

    # Flat float32 buffers. Local names avoid shadowing the builtins
    # `input` and `filter`.
    output_image = numpy.zeros(size).astype(numpy.float32)
    input_image = numpy.random.randn(input_size).astype(numpy.float32)
    filter_weights = numpy.random.randn(filter_size[0] * filter_size[1]).astype(numpy.float32)

    # The filter is passed both as a regular kernel argument and through
    # `cmem_args` under the name "d_filter".
    cmem_args = {"d_filter": filter_weights}
    args = [output_image, input_image, filter_weights]

    tune_params = OrderedDict()
    tune_params["filter_width"] = [filter_size[0]]
    tune_params["filter_height"] = [filter_size[1]]

    tune_params["block_size_x"] = [16 * i for i in range(1, 9)]  # 16, 32, ..., 128
    tune_params["block_size_y"] = [2**i for i in range(1, 6)]  # 2, 4, ..., 32

    tune_params["tile_size_x"] = [2**i for i in range(3)]  # 1, 2, 4
    tune_params["tile_size_y"] = [2**i for i in range(3)]  # 1, 2, 4

    tune_params["use_padding"] = [0, 1]  # toggle the insertion of padding in shared memory
    tune_params["read_only"] = [0, 1]  # toggle using the read-only cache

    # The grid dimensions are the problem size divided by these parameters.
    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    # Compute the answer using a naive kernel. The filter dimensions are
    # passed explicitly so the reference is always built with the same
    # filter size as the tuned kernel, even if `filter_size` is changed.
    reference_params = {
        "block_size_x": 16,
        "block_size_y": 16,
        "filter_width": filter_size[0],
        "filter_height": filter_size[1],
    }
    reference_results = kernel_tuner.run_kernel(
        "convolution_naive",
        kernel_string,
        problem_size,
        args,
        reference_params,
        grid_div_y=["block_size_y"],
        grid_div_x=["block_size_x"],
        lang="cupy",
    )

    # One entry per kernel argument: only the output (first argument) is
    # compared, the non-output fields are set to None.
    answer = [reference_results[0], None, None]

    # Start kernel tuning with correctness verification.
    return kernel_tuner.tune_kernel(
        "convolution_kernel",
        kernel_string,
        problem_size,
        args,
        tune_params,
        grid_div_y=grid_div_y,
        grid_div_x=grid_div_x,
        verbose=True,
        cmem_args=cmem_args,
        answer=answer,
        lang="cupy",
        strategy="genetic_algorithm",
        strategy_options=dict(max_fevals=5),
    )

def numpy_int64_to_int(obj):
    """``default`` hook for ``json.dump`` that converts NumPy values to built-ins.

    The ``json`` module calls this hook for objects it cannot serialize
    itself. Besides ``numpy.int64`` this also accepts the other NumPy
    integer widths, NumPy floats and booleans, and arrays.

    Args:
        obj: the object ``json`` could not serialize.

    Returns:
        An ``int``, ``float``, ``bool`` or (nested) ``list`` equivalent of
        ``obj``.

    Raises:
        TypeError: if ``obj`` is not one of the supported NumPy types, as
            the ``default`` hook protocol of ``json`` requires.
    """
    if isinstance(obj, numpy.integer):
        return int(obj)
    if isinstance(obj, numpy.floating):
        return float(obj)
    if isinstance(obj, numpy.bool_):
        return bool(obj)
    if isinstance(obj, numpy.ndarray):
        return obj.tolist()
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(obj).__name__)
    )

if __name__ == "__main__":
    # Run the tuning session, report how long it took, and save the results.
    import json
    import time

    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # system clock adjustments. The timer is stopped before printing so the
    # reported duration covers only the tuning run itself.
    start = time.perf_counter()
    results = tune()
    elapsed_ms = (time.perf_counter() - start) * 1000
    print(results)

    print("\n Actual time used:", elapsed_ms, "ms")

    # NumPy scalars in the results are converted by numpy_int64_to_int.
    with open("convolution_gpu_runtime.json", "w") as fp:
        json.dump(results, fp, default=numpy_int64_to_int)


Using: Tesla T4
Using: Tesla T4
skipping config 17_17_64_16_4_4_0_0 reason: too much shared memory used
skipping config 17_17_64_16_4_4_0_0 reason: too much shared memory used
filter_width=17, filter_height=17, block_size_x=64, block_size_y=16, tile_size_x=4, tile_size_y=4, use_padding=0, read_only=0, time=CompilationFailedConfig
filter_width=17, filter_height=17, block_size_x=80, block_size_y=2, tile_size_x=4, tile_size_y=2, use_padding=0, read_only=1, time=9.727ms
filter_width=17, filter_height=17, block_size_x=112, block_size_y=2, tile_size_x=1, tile_size_y=4, use_padding=1, read_only=1, time=6.349ms
filter_width=17, filter_height=17, block_size_x=16, block_size_y=8, tile_size_x=1, tile_size_y=4, use_padding=1, read_only=0, time=8.633ms
filter_width=17, filter_height=17, block_size_x=64, block_size_y=4, tile_size_x=2, tile_size_y=4, use_padding=0, read_only=0, time=6.300ms
max_fevals reached
best performing configuration:
filter_width=17, filter_height=17, block_size_x=64, block_siz