In [None]:
%%writefile Sources/cub.cu
#include "dli.h"

// Runs one simulation step: for every cell index in [0, temp_in.size()) the
// value returned by dli::compute is produced into `temp_out` through
// cub::DeviceTransform::Transform.
//
//   temp_in  - input temperature grid; copied by value into the device lambda
//   temp_out - destination buffer for the transformed values
//   stream   - CUDA stream handed to the CUB transform
//
// The status returned by the CUB call is not inspected here.
void simulate(dli::temperature_grid_f temp_in, float *temp_out, cudaStream_t stream)
{
  // Per-cell operation: maps a flat cell index to the value dli::compute yields.
  auto update_cell = [temp_in] __host__ __device__ (int cell_id) {
    return dli::compute(cell_id, temp_in);
  };

  // The input sequence is simply the cell indices 0, 1, 2, ...
  auto first_cell = thrust::make_counting_iterator(0);

  cub::DeviceTransform::Transform(first_cell, temp_out, temp_in.size(), update_cell, stream);
}

In [None]:
!nvcc --extended-lambda -o /tmp/a.out Sources/cub.cu # build executable
!/tmp/a.out # run executable

In [None]:
%%writefile Sources/simple-kernel.cu
#include "dli.h"

// Serial kernel: every thread that executes it walks over all cell indices
// from 0 up to in.size() and stores dli::compute(cell, in) into out[cell].
// There is no use of threadIdx/blockIdx, so the work is not divided between
// threads.
//
//   in  - input temperature grid
//   out - output buffer, written at indices [0, in.size())
__global__ void single_thread_kernel(dli::temperature_grid_f in, float *out)
{
  int cell = 0;
  while (cell < in.size())
  {
    out[cell] = dli::compute(cell, in);
    ++cell;
  }
}

// Enqueues single_thread_kernel on `stream` with exactly one block of one
// thread, so the whole grid is processed sequentially by that thread.
//
//   temp_in  - input temperature grid, passed to the kernel by value
//   temp_out - output buffer the kernel writes into
//   stream   - CUDA stream the kernel is launched on
void simulate(dli::temperature_grid_f temp_in, float *temp_out, cudaStream_t stream)
{
  const dim3 grid_dim(1);               // one block ...
  const dim3 block_dim(1);              // ... holding a single thread
  const size_t dynamic_smem_bytes = 0;  // no dynamic shared memory

  single_thread_kernel<<<grid_dim, block_dim, dynamic_smem_bytes, stream>>>(temp_in, temp_out);
}

In [None]:
!nvcc --extended-lambda -o /tmp/a.out Sources/simple-kernel.cu # build executable
!/tmp/a.out # run executable

In [None]:
%%writefile Sources/block-256-kernel.cu
#include "dli.h"

// Number of threads in the block used to launch block_kernel.
const int number_of_threads = 256;

// Splits the cells between the threads of one block: thread t handles cells
// t, t + blockDim.x, t + 2 * blockDim.x, ... and stores dli::compute(id, in)
// into out[id].
//
// Only threadIdx.x is used for indexing, so the kernel is meant to be launched
// with a single block; with several blocks each block would recompute the
// same cells.
//
//   in  - input temperature grid
//   out - output buffer, written at indices [0, in.size())
__global__ void block_kernel(dli::temperature_grid_f in, float *out)
{
  const int first_cell = threadIdx.x;

  // Stride by the block size of the actual launch rather than by the
  // compile-time constant, so every cell is visited exactly once even if the
  // kernel is launched with a block size other than number_of_threads.
  const int stride = blockDim.x;

  for (int id = first_cell; id < in.size(); id += stride)
  {
    out[id] = dli::compute(id, in);
  }
}

// Enqueues block_kernel on `stream` using one block of number_of_threads
// threads.
//
//   temp_in  - input temperature grid, passed to the kernel by value
//   temp_out - output buffer the kernel writes into
//   stream   - CUDA stream the kernel is launched on
void simulate(dli::temperature_grid_f temp_in, float *temp_out, cudaStream_t stream)
{
  const dim3 grid_dim(1);                    // a single block
  const dim3 block_dim(number_of_threads);   // threads cooperating inside it

  block_kernel<<<grid_dim, block_dim, 0, stream>>>(temp_in, temp_out);
}

In [None]:
!nvcc --extended-lambda -o /tmp/a.out Sources/block-256-kernel.cu # build executable
!/tmp/a.out # run executable

In [None]:
%%writefile Sources/grid-kernel.cu
#include "dli.h"

// Grid-stride kernel: each thread starts at its global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// launched threads (gridDim.x * blockDim.x) until it runs past in.size(),
// storing dli::compute(cell, in) into out[cell] for every cell it visits.
//
//   in  - input temperature grid
//   out - output buffer, written at indices [0, in.size())
__global__ void grid_kernel(dli::temperature_grid_f in, float *out)
{
  // Total threads in the launch; this is the distance between two cells
  // handled by the same thread.
  const int stride = gridDim.x * blockDim.x;

  int cell = blockIdx.x * blockDim.x + threadIdx.x;
  while (cell < in.size())
  {
    out[cell] = dli::compute(cell, in);
    cell += stride;
  }
}

// Returns a divided by b, rounded up to the next integer (for b > 0).
//
// Written as quotient-plus-remainder-check instead of (a + b - 1) / b so the
// intermediate sum cannot overflow `int` when `a` is close to INT_MAX.
//
//   a - dividend (number of items to cover)
//   b - divisor (chunk size); must be positive
int ceil_div(int a, int b)
{
  return a / b + (a % b > 0 ? 1 : 0);
}

// Enqueues grid_kernel on `stream` with enough 1024-thread blocks to give
// every cell of `temp_in` its own thread.
//
//   temp_in  - input temperature grid, passed to the kernel by value
//   temp_out - output buffer the kernel writes into
//   stream   - CUDA stream the kernel is launched on
void simulate(dli::temperature_grid_f temp_in, float *temp_out, cudaStream_t stream)
{
  const int block_size = 1024;
  const int grid_size = ceil_div(temp_in.size(), block_size);

  // An empty grid yields zero blocks. A zero-block launch is rejected as an
  // invalid configuration, and there is nothing to compute anyway.
  if (grid_size == 0)
  {
    return;
  }

  grid_kernel<<<grid_size, block_size, 0, stream>>>(temp_in, temp_out);
}

In [None]:
!nvcc --extended-lambda -o /tmp/a.out Sources/grid-kernel.cu # build executable
!/tmp/a.out # run executable