In [None]:
%%writefile Sources/simple-shmem.cu
#include <cstdio>

// Minimal shared-memory demo: each thread of the block stores its own index
// into a block-wide __shared__ array, then thread 0 prints the array.
//
// Launch layout: 1D blocks. The array holds num_slots (4) entries; threads
// with threadIdx.x >= num_slots do not write, so launching more threads than
// slots cannot write past the end of the array.
__global__ void kernel()
{
  constexpr int num_slots = 4;
  __shared__ int shared[num_slots];

  const int tid = threadIdx.x;
  if (tid < num_slots)
  {
    shared[tid] = tid;
  }
  // The barrier stays outside the branch so every thread of the block reaches
  // it; it makes the stores above visible to thread 0 before it reads them.
  __syncthreads();

  if (tid == 0)
  {
    // Shared memory is uninitialized, so only print the slots that some
    // thread of this block actually wrote.
    const int filled = min(static_cast<int>(blockDim.x), num_slots);
    for (int i = 0; i < filled; i++) {
      std::printf("shared[%d] = %d\n", i, shared[i]);
    }
  }
}

// Launches the demo kernel with one block of four threads and waits for it.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
  kernel<<<1, 4>>>();

  // A kernel launch returns nothing: configuration errors are only visible
  // through cudaGetLastError().
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    // Waits for the kernel to finish (which also flushes device-side printf
    // output) and reports any error raised while it was running.
    status = cudaDeviceSynchronize();
  }

  if (status != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    return 1;
  }
  return 0;
}

In [None]:
# Compile Sources/simple-shmem.cu with nvcc into /tmp/a.out and, only if the
# build succeeds (&&), run the resulting binary.
!nvcc -o /tmp/a.out Sources/simple-shmem.cu && /tmp/a.out

In [None]:
%%writefile Sources/shmem.cu
#include "dli.cuh"

// Builds a 10-bin histogram of `temperatures` (bin = value / 10, truncated)
// and accumulates it into `histogram`.
//
// Each block first counts into a private __shared__ histogram using
// block-scope atomics, then adds its per-bin totals into `histogram` with
// device-scope atomics, so global memory sees at most num_bins atomic adds
// per block.
//
// Launch layout: 1D grid of 1D blocks, one thread per input element; any
// block size works. Static shared memory: num_bins ints.
// Preconditions: `histogram` has at least num_bins elements. Counts are
// added to its current contents; this kernel does not zero it.
// Values whose bin falls outside [0, num_bins) are skipped instead of being
// written outside the shared array.
__global__ void histogram_kernel(cuda::std::span<float> temperatures,
                                 cuda::std::span<int> histogram) {
  constexpr int num_bins = 10;
  constexpr float bin_width = 10.0f;
  __shared__ int block_histogram[num_bins];

  // Block-stride loop: every bin is zeroed even when blockDim.x < num_bins.
  for (int bin = threadIdx.x; bin < num_bins; bin += blockDim.x) {
    block_histogram[bin] = 0;
  }
  __syncthreads();

  // Computed in size_t so blockIdx.x * blockDim.x cannot overflow an int.
  const size_t cell =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  // The grid is rounded up to whole blocks, so the last block can contain
  // threads past the end of the input. They skip the update but still reach
  // the barrier below (no early return before __syncthreads()).
  if (cell < temperatures.size()) {
    const int bin = static_cast<int>(temperatures[cell] / bin_width);
    if (bin >= 0 && bin < num_bins) {
      cuda::atomic_ref<int, cuda::thread_scope_block> block_ref(
          block_histogram[bin]);
      block_ref.fetch_add(1);
    }
  }
  __syncthreads();

  // Publish this block's counts to the global histogram.
  for (int bin = threadIdx.x; bin < num_bins; bin += blockDim.x) {
    cuda::atomic_ref<int, cuda::thread_scope_device> ref(histogram[bin]);
    ref.fetch_add(block_histogram[bin]);
  }
}

// Enqueues histogram_kernel on `stream` with one thread per temperature
// value, in blocks of 256 threads. The launch is asynchronous: this function
// does not synchronize the stream.
//
// temperatures     - input values passed to the kernel.
// block_histograms - not used by this implementation; kept so the signature
//                    stays compatible with existing callers.
// histogram        - output span the kernel accumulates bin counts into.
// stream           - CUDA stream the kernel is launched on.
void histogram(cuda::std::span<float> temperatures,
               cuda::std::span<int> block_histograms,
               cuda::std::span<int> histogram, cudaStream_t stream) {
  // An empty input would give a grid of 0 blocks, which is an invalid launch
  // configuration; there is nothing to count, so skip the launch.
  if (temperatures.empty()) {
    return;
  }

  int block_size = 256;
  // Round up so a final, partially filled block covers the tail elements.
  int grid_size = cuda::ceil_div(temperatures.size(), block_size);
  histogram_kernel<<<grid_size, block_size, 0, stream>>>(temperatures,
                                                         histogram);
}

In [None]:
# Hand Sources/shmem.cu to the helper module's run() entry point.
# NOTE(review): what run() does with the file (build, execute, validate) is
# defined in Sources/dli and is not visible from this cell — confirm there.
import Sources.dli
Sources.dli.run("Sources/shmem.cu")