In [None]:
!pip install -U featuretools
!pip install nvcc4jupyter
#!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Collecting featuretools
  Downloading featuretools-1.28.0-py3-none-any.whl (619 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.2/619.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting holidays<0.33,>=0.13 (from featuretools)
  Downloading holidays-0.32-py3-none-any.whl (754 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m754.4/754.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting woodwork>=0.23.0 (from featuretools)
  Downloading woodwork-0.27.0-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.1/236.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: holidays, woodwork, featuretools
  Attempting uninstall: holidays
    Found existing installation: holidays 0.41
    Uninstalling holidays-0.41:
      Successfully uninstalled holidays-0.41
Successfully installed featuretools-1.28.0 holidays-0.32 woodwork-0.27.0
Collecting nvcc4jupyter
  Downloading nvcc4ju

In [None]:
%%writefile main.cu
#include <stdio.h>
#include <iostream>
#include <stdexcept>
#include <cassert>

#define N 1000000
#define num_threads_per_block 1024

// Computes one partial dot product of `a` and `b` per thread block.
//
// Every thread sums a[i] * b[i] for i = index, index + total_num_threads, ...
// (while i < N) in a private accumulator, publishes that sum to shared memory,
// and the block then folds the shared array with a pairwise tree reduction.
// Thread 0 of each block stores the block total in output_buffer[blockIdx.x];
// the caller adds those entries together to obtain the final dot product.
//
// Requirements:
//   * blockDim.x <= num_threads_per_block (the shared array has that many slots).
//   * num_threads_per_block is a power of two (the reduction halves it each step).
//   * output_buffer holds at least gridDim.x ints.
__global__ void dot_product(int* a, int* b, int* output_buffer) {
  __shared__ int cache[num_threads_per_block];
  int index = (blockDim.x * blockIdx.x) + threadIdx.x;
  //                       # blocks * # threads per block.
  int total_num_threads = (gridDim.x * blockDim.x);

  // Accumulate in a register: shared memory is NOT zero-initialised, so doing
  // `cache[...] += ...` directly would start from an indeterminate value.
  int partial_sum = 0;
  while (index < N) {
    partial_sum += a[index] * b[index];
    index += total_num_threads;
  }

  // Fill every slot of the cache: this thread's own slot receives its partial
  // sum, and any slots beyond blockDim.x (present when the block is launched
  // with fewer than num_threads_per_block threads) are cleared to zero so the
  // reduction below never reads uninitialised shared memory.
  for (int slot = threadIdx.x; slot < num_threads_per_block; slot += blockDim.x) {
    cache[slot] = (slot == threadIdx.x) ? partial_sum : 0;
  }

  // Wait until every thread in the block has published its slots.
  __syncthreads();

  // Tree reduction over the shared array: each step adds the upper half onto
  // the lower half, so the block total ends up in cache[0] after
  // log2(num_threads_per_block) steps. `offset` is identical for all threads,
  // so every thread reaches each __syncthreads() the same number of times.
  for (int offset = num_threads_per_block / 2; offset != 0; offset /= 2) {
    if (threadIdx.x < offset) {
      cache[threadIdx.x] += cache[threadIdx.x + offset];
    }
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    // Write to the output buffer the result for this block.
    output_buffer[blockIdx.x] = cache[0];
  }
}


// Host driver: fills two N-element vectors (all 2s and all 12s), runs the
// dot_product kernel on the GPU, adds up the per-block partial results and
// verifies the total against the closed-form answer 2 * 12 * N.
//
// Returns 0 and prints "Works!" on success; prints a diagnostic to stderr and
// returns 1 if an allocation, a CUDA call, or the verification fails.
int main() {

  const int num_blocks = 14;
  const size_t vector_bytes = sizeof(int) * N;
  const size_t results_bytes = sizeof(int) * num_blocks;

  int *a_host = NULL, *b_host = NULL, *block_results_host = NULL;
  int *a_device = NULL, *b_device = NULL, *block_results_device = NULL;

  // Turns a failed CUDA runtime call into an exception so that every failure
  // funnels through the single cleanup path at the bottom of main.
  auto check = [](cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
      throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
    }
  };

  int exit_code = 0;
  try {
    // Allocate space on the CPU for the arrays.
    a_host = (int*) malloc(vector_bytes);
    b_host = (int*) malloc(vector_bytes);
    block_results_host = (int*) malloc(results_bytes);
    if (a_host == NULL || b_host == NULL || block_results_host == NULL) {
      throw std::runtime_error("host allocation failed");
    }

    // Allocate space on the GPU for the arrays.
    check(cudaMalloc((void**)&a_device, vector_bytes), "cudaMalloc(a_device)");
    check(cudaMalloc((void**)&b_device, vector_bytes), "cudaMalloc(b_device)");
    check(cudaMalloc((void**)&block_results_device, results_bytes),
          "cudaMalloc(block_results_device)");

    // Write to a and b.
    for (int i = 0; i < N; i++) {
      a_host[i] = 2;
      b_host[i] = 12;
    }

    for (int i = 0; i < num_blocks; i++) {
      block_results_host[i] = 0;
    }

    // Copy a and b (and the zeroed result buffer) to the GPU.
    check(cudaMemcpy(a_device, a_host, vector_bytes, cudaMemcpyHostToDevice),
          "cudaMemcpy(a_device)");
    check(cudaMemcpy(b_device, b_host, vector_bytes, cudaMemcpyHostToDevice),
          "cudaMemcpy(b_device)");
    check(cudaMemcpy(block_results_device, block_results_host, results_bytes,
                     cudaMemcpyHostToDevice),
          "cudaMemcpy(block_results_device)");

    // Run the kernel on the full grid: num_blocks blocks, each with
    // num_threads_per_block threads (the size of the kernel's shared cache),
    // instead of a single thread doing all of the work.
    dot_product<<<num_blocks, num_threads_per_block>>>(a_device, b_device, block_results_device);
    // A bad launch configuration is only reported through cudaGetLastError().
    check(cudaGetLastError(), "dot_product launch");

    // Copy the results buffer back from the GPU. This copy waits for the
    // kernel to finish, so errors raised during execution surface here.
    check(cudaMemcpy(block_results_host, block_results_device, results_bytes,
                     cudaMemcpyDeviceToHost),
          "cudaMemcpy(block_results_host)");

    // Sum all the entries in the result buffer.
    int sum = 0;
    for (int i = 0; i < num_blocks; i++) {
      sum += block_results_host[i];
    }

    // Explicit comparison instead of assert(): assert() is compiled out when
    // NDEBUG is defined, which would make the check silently pass.
    const int expected = 2 * 12 * N;
    if (sum != expected) {
      std::cerr << "Mismatch: got " << sum << ", expected " << expected << std::endl;
      exit_code = 1;
    } else {
      std::cout << "Works!" << std::endl;
    }
  } catch (const std::exception& error) {
    std::cerr << "Error: " << error.what() << std::endl;
    exit_code = 1;
  }

  // Release device and host buffers on every path; both cudaFree and free
  // accept null pointers, so partially completed setup is handled too.
  cudaFree(a_device);
  cudaFree(b_device);
  cudaFree(block_results_device);
  free(a_host);
  free(b_host);
  free(block_results_host);

  return exit_code;
}

Writing main.cu


In [None]:
%%bash
# Build the CUDA source saved by the `%%writefile main.cu` cell into ./dot_product.
nvcc main.cu -o dot_product

In [None]:
%%bash
# Execute the binary produced by the nvcc cell; it prints "Works!" on success.
./dot_product

Works!
