In [1]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-yv_z3qmb
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-yv_z3qmb
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=f8456cc4fac72e80e1b4ea4025de3844b78f148e13f10952a5b90711cf72f585
  Stored in directory: /tmp/pip-ephem-wheel-cache-li9vb7a5/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collecte

In [3]:
%load_ext nvcc_plugin


created output directory at /content/src
Out bin /content/result.out


In [4]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Problem size and launch configuration for the vector-add example below.
// numBlock * numThread = 2500 * 4 = 10000 threads are launched in total,
// which equals N: one thread is available per array element.
#define N 10000
#define numThread 4 // 4 threads in a block
#define numBlock 2500  // 2500 blocks

/*
 * 1.
 *  The 'kernel' function that will be executed on the GPU device hardware.
 *
 *  Computes c[i] = a[i] + b[i] for every i in [0, N).
 *
 *  Launch layout: a linear (1-D) grid of 1-D blocks in the 'x' dimension,
 *  as declared in main when this function is launched. Any grid/block size
 *  produces a complete result, because each thread strides over the array
 *  by the total number of launched threads.
 *
 *  a, b : input arrays in device memory, at least N ints each
 *  c    : output array in device memory, at least N ints
 */
__global__ void add( int *a, int *b, int *c ) {

    // The first index this thread works on: its position within the whole grid.
    // e.g. with 4 blocks of 2 threads each (8 threads in total):
    //      the thread whose threadIdx.x is 0 within block 0 starts at c[0],
    //          because tid = (2 * 0) + 0
    //      the thread whose threadIdx.x is 0 within block 1 starts at c[2],
    //          because tid = (2 * 1) + 0
    //      the thread whose threadIdx.x is 1 within block 1 starts at c[3],
    //          because tid = (2 * 1) + 1
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Total number of threads in the grid. Advancing by this amount (and not
    // by blockDim.x alone) guarantees that no two threads ever process the
    // same element, so every c[i] is written exactly once.
    int stride = blockDim.x * gridDim.x;

    // When the grid has at least N threads this loop body runs at most once
    // per thread; with fewer threads each one handles several elements that
    // are 'stride' apart.
    while (tid < N) {
        c[tid] = a[tid] + b[tid];       // The actual computation done by the thread
        tid += stride;
    }
}


/*
 * Abort the program with a diagnostic if a CUDA runtime call did not succeed.
 *
 * status : the value returned by the CUDA runtime call
 * what   : short description of the step, used in the error message
 */
static void checkCuda( cudaError_t status, const char *what ) {
    if (status != cudaSuccess) {
        fprintf( stderr, "CUDA error during %s: %s\n",
                 what, cudaGetErrorString( status ) );
        exit( EXIT_FAILURE );
    }
}

/*
 * The main program that directs the execution of vector add on the GPU.
 *
 * Returns 0 when every element of the result matches the CPU reference,
 * and a non-zero status when a mismatch is found. Any failing allocation
 * or CUDA runtime call terminates the program with a non-zero status.
 */
int main( void ) {
    int *a, *b, *c;               // The arrays on the host CPU machine
    int *dev_a, *dev_b, *dev_c;   // The arrays for the GPU device
    const size_t bytes = N * sizeof(int);

    // 2.a allocate the memory on the CPU
    a = (int*)malloc( bytes );
    b = (int*)malloc( bytes );
    c = (int*)malloc( bytes );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "Host allocation of %d ints failed\n", N );
        return EXIT_FAILURE;
    }

    // 2.b. fill the arrays 'a' and 'b' on the CPU with dummy values
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i;
    }

    // 2.c. allocate the memory on the GPU
    checkCuda( cudaMalloc( (void**)&dev_a, bytes ), "cudaMalloc dev_a" );
    checkCuda( cudaMalloc( (void**)&dev_b, bytes ), "cudaMalloc dev_b" );
    checkCuda( cudaMalloc( (void**)&dev_c, bytes ), "cudaMalloc dev_c" );

    // 2.d. copy the arrays 'a' and 'b' to the GPU
    checkCuda( cudaMemcpy( dev_a, a, bytes, cudaMemcpyHostToDevice ),
               "copy of a to device" );
    checkCuda( cudaMemcpy( dev_b, b, bytes, cudaMemcpyHostToDevice ),
               "copy of b to device" );

    // 3. Execute the vector addition 'kernel function' on the GPU device,
    // declaring how many blocks and how many threads per block to use.
    add<<<numBlock,numThread>>>( dev_a, dev_b, dev_c );

    // A kernel launch returns no status itself: an invalid launch
    // configuration is reported by cudaGetLastError(), and a fault while
    // the kernel runs is reported by the next synchronizing call.
    checkCuda( cudaGetLastError(), "kernel launch" );
    checkCuda( cudaDeviceSynchronize(), "kernel execution" );

    // 4. copy the array 'c' back from the GPU to the CPU
    checkCuda( cudaMemcpy( c, dev_c, bytes, cudaMemcpyDeviceToHost ),
               "copy of c to host" );

    // verify that the GPU did the work we requested
    bool success = true;
    int total=0;
    printf("Checking %d values in the array.\n", N);
    for (int i=0; i<N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            printf( "Error:  %d + %d != %d\n", a[i], b[i], c[i] );
            success = false;
        }
        total += 1;
    }
    if (success)  printf( "We did it, %d values correct!\n", total );

    // free the memory we allocated on the CPU
    free( a );
    free( b );
    free( c );

    // free the memory we allocated on the GPU
    checkCuda( cudaFree( dev_a ), "cudaFree dev_a" );
    checkCuda( cudaFree( dev_b ), "cudaFree dev_b" );
    checkCuda( cudaFree( dev_c ), "cudaFree dev_c" );

    // Report a verification failure through the exit status as well.
    return success ? 0 : EXIT_FAILURE;
}


Checking 10000 values in the array.
We did it, 10000 values correct!

