In [None]:
# Remove (and purge the configuration of) packages matching cuda, nvidia* and libnvidia-*.
!apt-get --purge remove cuda nvidia* libnvidia-*
# List installed packages, keep the lines containing "cuda-", take the package
# name (second column) and purge those packages one at a time.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
# Remove any remaining packages matching the pattern cuda-*.
!apt-get remove cuda-*
# Drop dependencies that are no longer required by any installed package.
!apt autoremove
# Refresh the package index.
!apt-get update

Install CUDA version 9.2

In [None]:
# Download the CUDA 9.2 local-repository package (the file name targets Ubuntu 16.04, amd64)
# and save it under a name ending in .deb so dpkg accepts it.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Install the downloaded repository package.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Add the public key shipped under /var/cuda-repo-9-2-local to apt's trusted keys.
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# Refresh the package index so the packages from the new repository are visible.
!apt-get update
# Install the cuda-9.2 package.
!apt-get install cuda-9.2

Check the CUDA version

In [3]:
# Print the version of the nvcc compiler driver to confirm which CUDA release is in use.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


Run the given command to install an extension to run nvcc (NVIDIA CUDA Compiler) from the notebook cells.

In [None]:
# Install the nvcc4jupyter extension straight from its GitHub repository.
# The https transport is used because GitHub no longer serves the
# unauthenticated git:// protocol, so a git+git:// URL fails to clone.
# NOTE(review): this installs the repository's current default branch; confirm it
# still provides the `nvcc_plugin` extension and `%%cu` magic used by the cells
# below, or pin a known-good revision with `...nvcc4jupyter.git@<revision>`.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Load the extension 

In [5]:
# Load the nvcc_plugin IPython extension installed above
# (presumably what registers the %%cu cell magic used by the following cells — confirm).
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


Test by running the "Hello World!" CUDA Code.

In [6]:
%%cu

#include <bits/stdc++.h>
using namespace std;

/*
 * Kernel: every thread that runs it prints one greeting line using
 * device-side printf. Takes no arguments and writes no memory.
 */
__global__ void hello()
{
    printf("Hello World!\n");
}

// Launches the hello kernel with one block of one thread and waits for it to finish.
// Returns 0 on success, or 1 (after printing the CUDA error string to stderr)
// if the launch or the kernel execution failed.
int main() {
    hello<<<1,1>>>();

    // A kernel launch has no return value; launch errors are read back with cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Wait for the kernel; errors raised while it ran are reported by this call.
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Hello World!



Vector Addition

In [7]:
%%cu

#include <bits/stdc++.h>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>

#define N 100000000


// Host-side element-wise addition: stores a[k] + b[k] into sum[k]
// for every k from 0 up to (but not including) n.
void addvec(int *sum, int *a, int *b, int n) {
    int k = 0;
    while (k < n) {
        sum[k] = a[k] + b[k];
        ++k;
    }
}

// CPU baseline: fills two N-element arrays with random values, adds them with
// addvec(), and prints sum[i]-a[i]-b[i] for the first 100 elements (every value
// is 0 when the addition is correct).
// Returns 0 on success, 1 if a host allocation fails.
int main() {
    int *a, *b, *sum;

    // Allocate Memory
    a = (int*)malloc(sizeof(int)*N);
    b = (int*)malloc(sizeof(int)*N);
    sum = (int*)malloc(sizeof(int)*N);

    // Each buffer holds N ints; malloc may fail for requests this large.
    if (a == NULL || b == NULL || sum == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a); free(b); free(sum); // free(NULL) is a no-op
        return 1;
    }

    srand(time(0)); // Seed rand()

    // Initialize the arrays.
    // rand() may return values as large as INT_MAX, so adding two raw results
    // could overflow int (undefined behaviour); halving each keeps the sum in range.
    for(int i = 0; i < N; i++) {
        a[i] = rand() / 2;
        b[i] = rand() / 2;
    }

    addvec(sum,a,b,N);

    // Print the residual of the first 100 elements, ten per line.
    for(int i=0; i < 100; i++) {
        if((i%10)==0 && i!=0) printf("\n");
        printf("%d ", (sum[i]-a[i]-b[i]));
    }
    printf("\n");

    free(a); free(b); free(sum);
    return 0;
}

0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 


On GPU with 1 Thread

In [8]:
%%cu

#include <bits/stdc++.h>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>

#define N 100000000


// Kernel that adds a and b element-wise into sum.
// No thread or block index is used: each thread that runs the kernel walks
// all n elements sequentially by itself.
__global__ void addvec(int *sum, int *a, int *b, int n) {
    for (int pos = 0; n > pos; ++pos) {
        const int total = a[pos] + b[pos];
        sum[pos] = total;
    }
}

// Attempts the vector addition on the GPU using a single thread.
// NOTE(review): the prose after this cell asks why the program does not work,
// so it appears to be kept as a deliberately failing example; the code is left
// unchanged. What the code shows:
//   * a, b and sum come from malloc(), i.e. they are host pointers; nothing in
//     this cell calls cudaMalloc/cudaMemcpy, yet the pointers are handed to a kernel.
//   * the launch is followed by neither an error check nor a synchronization,
//     so a failed launch or kernel is never reported.
//   * sum is never written by host code, so the loop below prints whatever the
//     uninitialized malloc() buffer happens to contain minus a[i] and b[i].
int main() {
    int *a, *b, *sum;

    // Allocate host memory (plain malloc; return values are not checked)
    a = (int*)malloc(sizeof(int)*N);
    b = (int*)malloc(sizeof(int)*N);
    sum = (int*)malloc(sizeof(int)*N);

    srand(time(0)); // Seed rand()

    // Initialize the input arrays on the host
    for(int i = 0; i < N; i++) {
        a[i] = rand();
        b[i] = rand();
    }

    // One block with one thread; the arguments are the host pointers allocated above.
    addvec<<<1,1>>>(sum,a,b,N);

    // Print sum[i]-a[i]-b[i] for the first 100 elements, ten per line
    // (would be all zeros if sum held a[i]+b[i]).
    for(int i=0; i < 100; i++) {
        if((i%10)==0 && i!=0) printf("\n");
        printf("%d ", (sum[i]-a[i]-b[i]));
    }    
}

-1900336617 1218874304 -1968213063 1910910017 -2042127667 1926465564 1684999816 1304094969 -2117003875 -1477387866 
1214081669 -1610799502 -2107658460 1335962836 -1221924173 1714638098 -2108275262 -1141355803 2011056644 -938639566 
631202740 582746946 1228891060 -2118606168 -1905321585 313849285 -2014738790 -1181120331 -1868529198 1693356167 
784351085 -1917746866 2046539370 1042038442 2005418839 -268442307 1091224871 -1489626811 -2093820743 1432003114 
1385707578 1752068699 -1698581467 1691022480 725790872 1932370555 -1797561682 186217778 1250488313 -2135877243 
1331655092 -549922214 -2090014913 -1699455051 794668940 -1740527041 1478400462 -596138344 -1564993623 507675625 
-1963188162 -1476544445 -1840297429 -1364288509 2085413198 -1628869260 -1609518007 1544980956 -2070856852 706566015 
1725557615 -1243295045 -1022419368 -1311190226 -744549254 -1279132096 1636531578 2121447331 -1898519562 1353908680 
-595523283 -691005371 1841215240 -1609606915 -1447982159 -1505829528 -1937001926 133

Why does the above program not work?

Parallelizing

In [9]:
%%cu

#include <bits/stdc++.h>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
#include<cuda.h>

#define N 100000000


// Adds a and b element-wise into sum using a grid-stride loop.
//
// Each thread starts at its global index (blockIdx.x * blockDim.x + threadIdx.x)
// and advances by the total number of launched threads (gridDim.x * blockDim.x),
// so the n elements are split across all threads of all blocks. With a single
// block this is the same as starting at threadIdx.x and stepping by blockDim.x;
// with several blocks the work is shared instead of being repeated by every block.
//
// sum, a, b: arrays of at least n ints that the device can access.
// n:         number of elements to process; nothing is written when n <= 0.
// Launch:    1-D grid of 1-D blocks, any sizes.
__global__ void addvec(int *sum, int *a, int *b, int n) {
    // 64-bit counters so neither the index product nor `i += stride` can overflow.
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for(long long i = first; i < n; i += stride) {
        sum[i] = a[i] + b[i];
    }
}

// Prints a message naming the failed step and exits when a CUDA runtime call
// did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Vector addition on the GPU with explicit device buffers and a single block
// of 256 threads: copies a and b to the device, runs addvec, copies sum back,
// and prints sum[i]-a[i]-b[i] for the first 100 elements (all zeros when correct).
// Returns 0 on success; exits with a failure status if an allocation or any
// CUDA call fails.
int main() {
    const size_t bytes = sizeof(int)*N;
    int *a, *b, *sum;
    int *d_a, *d_b, *d_sum;

    // Allocate Memory on host (CPU)
    a = (int*)malloc(bytes);
    b = (int*)malloc(bytes);
    sum = (int*)malloc(bytes);
    if (a == NULL || b == NULL || sum == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a); free(b); free(sum); // free(NULL) is a no-op
        return 1;
    }

    srand(time(0)); // Seed rand()

    // Initialize the arrays.
    // rand() may return values as large as INT_MAX, so adding two raw results
    // could overflow int; halving each keeps a[i] + b[i] in range.
    for(int i = 0; i < N; i++) {
        a[i] = rand() / 2;
        b[i] = rand() / 2;
    }

    // Allocate Memory on device (GPU)
    checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void**)&d_sum, bytes), "cudaMalloc(d_sum)");

    // Transfer Data from host to device
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

    addvec<<<1,256>>>(d_sum,d_a,d_b,N);
    // A launch returns nothing: read back launch errors, then wait for the
    // kernel so that errors raised while it ran are reported too.
    checkCuda(cudaGetLastError(), "addvec launch");
    checkCuda(cudaDeviceSynchronize(), "addvec execution");

    // Transfer the result from device to host
    checkCuda(cudaMemcpy(sum, d_sum, bytes, cudaMemcpyDeviceToHost), "copy of sum to host");

    // Print the residual of the first 100 elements, ten per line.
    for(int i=0; i < 100; i++) {
        if((i%10)==0 && i!=0) printf("\n");
        printf("%d ", (sum[i]-a[i]-b[i]));
    }
    printf("\n");

    free(a); free(b); free(sum);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_sum);
    return 0;
}

0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 


Adding more thread blocks

In [10]:
%%cu

#include <bits/stdc++.h>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
#include<cuda.h>

#define N 100000000


// One-element-per-thread vector addition.
// Each thread derives its index from its block and thread coordinates and
// writes sum[index] = a[index] + b[index]; threads whose index is not below n
// return without touching memory.
__global__ void addvec(int *sum, int *a, int *b, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    if (gid >= n) {
        return;
    }

    sum[gid] = a[gid] + b[gid];
}

// Prints a message naming the failed step and exits when a CUDA runtime call
// did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Vector addition on the GPU with explicit device buffers and one thread per
// element: copies a and b to the device, launches enough 256-thread blocks to
// cover N elements, copies sum back, and prints sum[i]-a[i]-b[i] for the first
// 100 elements (all zeros when correct).
// Returns 0 on success; exits with a failure status if an allocation or any
// CUDA call fails.
int main() {
    const size_t bytes = sizeof(int)*N;
    int *a, *b, *sum;
    int *d_a, *d_b, *d_sum;

    // Allocate Memory on host (CPU)
    a = (int*)malloc(bytes);
    b = (int*)malloc(bytes);
    sum = (int*)malloc(bytes);
    if (a == NULL || b == NULL || sum == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a); free(b); free(sum); // free(NULL) is a no-op
        return 1;
    }

    srand(time(0)); // Seed rand()

    // Initialize the arrays.
    // rand() may return values as large as INT_MAX, so adding two raw results
    // could overflow int; halving each keeps a[i] + b[i] in range.
    for(int i = 0; i < N; i++) {
        a[i] = rand() / 2;
        b[i] = rand() / 2;
    }

    // Allocate Memory on device (GPU)
    checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void**)&d_sum, bytes), "cudaMalloc(d_sum)");

    // Transfer Data from host to device
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

    int block_size = 256;
    // Ceiling division: the smallest number of blocks whose threads cover N
    // elements, without an extra idle block when N is a multiple of block_size.
    int grid_size = ((N + block_size - 1)/block_size);

    addvec<<<grid_size,block_size>>>(d_sum,d_a,d_b,N);
    // A launch returns nothing: read back launch errors, then wait for the
    // kernel so that errors raised while it ran are reported too.
    checkCuda(cudaGetLastError(), "addvec launch");
    checkCuda(cudaDeviceSynchronize(), "addvec execution");

    // Transfer the result from device to host
    checkCuda(cudaMemcpy(sum, d_sum, bytes, cudaMemcpyDeviceToHost), "copy of sum to host");

    // Print the residual of the first 100 elements, ten per line.
    for(int i=0; i < 100; i++) {
        if((i%10)==0 && i!=0) printf("\n");
        printf("%d ", (sum[i]-a[i]-b[i]));
    }
    printf("\n");

    free(a); free(b); free(sum);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_sum);
    return 0;
}

0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 


In [15]:
%%cu

#include <bits/stdc++.h>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
#include<cuda.h>

#define N 1000000


// Element-wise vector addition, one element per thread.
// A thread's element index is built from threadIdx.x, blockDim.x and
// blockIdx.x; the store is skipped unless that index is below n.
__global__ void addvec(int *sum, int *a, int *b, int n) {
    const int elem = threadIdx.x + blockDim.x * blockIdx.x;
    const bool inside = elem < n;

    if (inside) {
        const int lhs = a[elem];
        const int rhs = b[elem];
        sum[elem] = lhs + rhs;
    }
}

// Prints a message naming the failed step and exits when a CUDA runtime call
// did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Vector addition using Unified Memory: the three arrays are allocated with
// cudaMallocManaged, filled on the host, added on the GPU with one thread per
// element, and then read directly on the host to print sum[i]-a[i]-b[i] for
// the first 100 elements (all zeros when correct).
// Returns 0 on success; exits with a failure status if any CUDA call fails.
int main() {
    const size_t bytes = sizeof(int)*N;
    int *a, *b, *sum;

    // Allocating Unified Memory - accessible by host and device
    checkCuda(cudaMallocManaged(&a, bytes), "cudaMallocManaged(a)");
    checkCuda(cudaMallocManaged(&b, bytes), "cudaMallocManaged(b)");
    checkCuda(cudaMallocManaged(&sum, bytes), "cudaMallocManaged(sum)");

    srand(time(0)); // Seed rand()

    // Initialize the arrays.
    // rand() may return values as large as INT_MAX, so adding two raw results
    // could overflow int; halving each keeps a[i] + b[i] in range.
    for(int i = 0; i < N; i++) {
        a[i] = rand() / 2;
        b[i] = rand() / 2;
    }

    int block_size = 256;
    // Ceiling division: the smallest number of blocks whose threads cover N
    // elements, with no extra idle block when N is a multiple of block_size.
    int grid_size = ((N + block_size - 1)/block_size);

    addvec<<<grid_size,block_size>>>(sum,a,b,N);
    // A launch returns nothing: read back launch errors first.
    checkCuda(cudaGetLastError(), "addvec launch");

    // Wait for the kernel before the host reads sum; errors raised while the
    // kernel ran are reported by this call.
    checkCuda(cudaDeviceSynchronize(), "addvec execution");

    // Print the residual of the first 100 elements, ten per line.
    for(int i=0; i < 100; i++) {
        if((i%10)==0 && i!=0) printf("\n");
        printf("%d ", (sum[i]-a[i]-b[i]));
    }
    printf("\n");

    cudaFree(a); cudaFree(b); cudaFree(sum);
    return 0;
}

0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
