# Benchmarking

The goal is to write Kernels targeting CPU, SIMD, GPU, Python/C/C++ etc and do a performance analysis

In [2]:
import torch

In [42]:
from torch.utils.cpp_extension import load_inline

In [47]:
commonCppHeader = """
#include <torch/extension.h>
#include <iostream>
#include <vector>
#include <arm_neon.h>
using namespace std;
"""

In [None]:
vectorAddCppNeon = """
void vectorAddNeon(uint8_t* a, uint8_t* b, uint8_t* c, int n) {
  for (int i = 0; i < n; i += 16) {
    uint8x16_t va = vld1q_u8(a + i);
    uint8x16_t vb = vld1q_u8(b + i);
    uint8x16_t vc = vaddq_u8(va, vb);
    vst1q_u8(c + i, vc);
  }
}

void printHi(string str="Hi") {
  cout << str << endl;
}
"""


In [None]:
vectorAddModule = load_inline(
  name='vectorAddNeonModule',
  cpp_sources=[commonCppHeader, vectorAddCppNeon],
  functions=['vectorAddNeon', 'printHi'],
  verbose=True
)



In [None]:
torch.utils.cpp_extension.include_paths()[0]

In [None]:
vectorAddModule.printHi("LMFAO")
a = torch.randint(0, 20, (100,), dtype=torch.uint8)
b = torch.randint(0, 20, (100,), dtype=torch.uint8)
c = torch.zeros(100, dtype=torch.uint8)
# # Pass pointers of a,b,c to the C++ function
vectorAddModule.vectorAddNeon(a.data_ptr(), b.data_ptr(), c.data_ptr(), 100)
# print(c)

In [3]:
def timePyTorchFunction(f, input):
  start = torch.mps.Event(enable_timing=True)
  end = torch.mps.Event(enable_timing=True)
  for _ in range(5):
    f(input)
  start.record()
  for _ in range(1000):
    f(input)
  end.record()
  torch.mps.synchronize()
  return start.elapsed_time(end)

In [4]:
def square(x):
  return x * x

def square_2(x):
  return x ** 2

def identity(x):
  return x

In [39]:
# make a Pytorch Int32 array
b = torch.randint(0, 1000, (10000,), dtype=torch.int32)

In [71]:
times = {}
times["torch.square"] = timePyTorchFunction(torch.square, b)
times["Python x ** 2"] = timePyTorchFunction(square, b)
times["x * x"] = timePyTorchFunction(square_2, b)
times["neon"] = timePyTorchFunction(squareInt32Module.square_int32_neon, b.data_ptr(), c.data_ptr(), 10000, b)

times

TypeError: timePyTorchFunction() takes 2 positional arguments but 5 were given

In [52]:
# write a Neon square C++ function

squareInt32CppNeon = """
void square_neon(int32_t* input, int32_t* output, int64_t size) {
    int64_t i = 0;

    // Vectorized processing using ARM NEON for int32
    for (; i <= size - 4; i += 4) {
        int32x4_t v = vld1q_s32(input + i);  // Load 4 int32 values
        int32x4_t v_sq = vmulq_s32(v, v);    // Square each element
        vst1q_s32(output + i, v_sq);         // Store back to memory
    }

    // Handle remaining elements
    for (; i < size; i++) {
        output[i] = input[i] * input[i];
    }
}

// Wrapper function to receive pointers as integers from Python
void square_int32_neon(uintptr_t input_ptr, uintptr_t output_ptr, int64_t size) {
    int32_t* input = reinterpret_cast<int32_t*>(input_ptr);
    int32_t* output = reinterpret_cast<int32_t*>(output_ptr);

    square_neon(input, output, size);
}
"""

squareInt32Module = load_inline(
  name='squareInt32NeonModule',
  cpp_sources=[commonCppHeader, squareInt32CppNeon],
  functions=['square_int32_neon'],
  verbose=True
)


Using /Users/viranchee/Library/Caches/torch_extensions/py39_cpu as PyTorch extensions root...
The input conditions for extension module squareInt32NeonModule have changed. Bumping to version 3 and re-building as squareInt32NeonModule_v3...
Emitting ninja build file /Users/viranchee/Library/Caches/torch_extensions/py39_cpu/squareInt32NeonModule/build.ninja...
Building extension module squareInt32NeonModule_v3...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/2] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=squareInt32NeonModule_v3 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_clang\" -DPYBIND11_STDLIB=\"_libcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1002\" -isystem /Volumes/code/env/pymetal/lib/python3.9/site-packages/torch/include -isystem /Volumes/code/env/pymetal/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /Users/viranchee/Library/Caches/torch_extensions/py39_cpu/squareInt32NeonModule/main.cpp -o main.o 
[2/2] c++ main.o -shared -L/Volumes/code/env/pymetal/lib/python3.9/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -undefined dynamic_lookup -o squareInt32NeonModule_v3.so


Loading extension module squareInt32NeonModule_v3...


In [69]:
c = torch.zeros(10000, dtype=torch.int32)
if (b.is_contiguous() == False):
  b = b.contiguous()
squareInt32Module.square_int32_neon(b.data_ptr(), c.data_ptr(), 10000)

In [62]:

with torch.autograd.profiler.profile(use_cpu=True) as prof:
  squareInt32Module.square_int32_neon(b.data_ptr(), c.data_ptr(), 10000)

# print(prof.key_averages().table(sort_by="cpu_time", row_limit=10))
prof.key_averages()


[]

In [60]:
prof.key_averages()

[]

In [67]:
c[0] = 5

In [70]:
assert(torch.all(c == b * b))