In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0


In [2]:
# Install the nvcc4jupyter plugin over HTTPS (GitHub no longer serves the unauthenticated git:// protocol).
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-veor47_u
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-veor47_u
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=65610b41f01f86fe90a080ef7ce4e2d76ca659db7678d7c21ae470baabfba7f4
  Stored in directory: /tmp/pip-ephem-wheel-cache-nygx10al/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [4]:
%cd src

/content/src


In [5]:
%%cuda --name core.h
// core.h - declarations shared by the CUDA experiment sources.
// Guard against double inclusion: this header defines variables, so pulling
// it in twice within one translation unit would be a redefinition error.
#pragma once

#include <cuda.h>
#include <cuda_runtime.h>

// Checks the status of a CUDA runtime call. Implemented in core.cu: prints
// the error string and exits the process on failure, returns true otherwise.
bool cudaCheck(cudaError_t);

// Prints a summary of every CUDA device reported by the runtime
// (implemented in core.cu).
void DeviceProp();

// Event pair used by the experiment's main() to time a kernel launch.
// NOTE(review): these are definitions, not extern declarations, so this
// header must be included from only one translation unit per program.
cudaEvent_t start, stop;

'File written in /content/src/core.h'

In [7]:
%%cuda --name core.cu
#include <iostream>
using namespace std;

// Validates the status code returned by a CUDA runtime call.
//
// Returns true when `err` is cudaSuccess, so the call can be used directly as
// an `if` condition. For any other status the CUDA error string is written to
// stderr and the process terminates with EXIT_FAILURE (this function does not
// return in that case).
bool cudaCheck(cudaError_t err) {
    if (err == cudaSuccess) {
        return true;
    }
    std::cerr << "Code Failed due to " << cudaGetErrorString(err) << std::endl;
    exit(EXIT_FAILURE);
}

// Writes a human-readable summary of one device's properties to stdout:
// name, total global memory, shared memory per block, warp size, maximum
// threads per block, the per-dimension block and grid limits, and the
// multiprocessor count. Every line after the name is indented with a tab.
void printProp(cudaDeviceProp devP) {
    // Scalar properties, one per line (each line is flushed via std::endl).
    std::cout << "Name: " << devP.name << std::endl
              << "\tTotal Global Memory: " << devP.totalGlobalMem << std::endl
              << "\tShared Memory per Block: " << devP.sharedMemPerBlock << std::endl
              << "\tWarp Size: " << devP.warpSize << std::endl
              << "\tMax Threads per Block: " << devP.maxThreadsPerBlock << std::endl;

    // Per-dimension limits: all three block dimensions first, then the grid's.
    const int kDims = 3;
    for (int axis = 0; axis != kDims; ++axis) {
        const int blockLimit = devP.maxThreadsDim[axis];
        std::cout << "\tMax of dimension " << axis << " of block: " << blockLimit << std::endl;
    }
    for (int axis = 0; axis != kDims; ++axis) {
        const int gridLimit = devP.maxGridSize[axis];
        std::cout << "\tMax of dimension " << axis << " of grid: " << gridLimit << std::endl;
    }

    std::cout << "\tNumber of multiprocessors: " << devP.multiProcessorCount << std::endl;
}

// Prints the properties of every CUDA device the runtime reports, using
// printProp() for each one.
//
// Failures are reported on stderr instead of being ignored: if the device
// count cannot be queried nothing is printed and the function returns; if the
// properties of one device cannot be read, that device is skipped.
void DeviceProp() {
    // Start from zero so a failed query can never leave a garbage loop bound.
    int devCount = 0;
    cudaError_t status = cudaGetDeviceCount(&devCount);
    if (status != cudaSuccess) {
        std::cerr << "Could not query CUDA device count: "
                  << cudaGetErrorString(status) << std::endl;
        return;
    }

    for (int i = 0; i < devCount; ++i) {
        cudaDeviceProp devP;
        status = cudaGetDeviceProperties(&devP, i);
        if (status != cudaSuccess) {
            // devP is not valid here, so do not print it.
            std::cerr << "Could not read properties of device " << i << ": "
                      << cudaGetErrorString(status) << std::endl;
            continue;
        }
        printProp(devP);
    }
}

'File written in /content/src/core.cu'

In [230]:
%%cuda --name exp1.cu
#include "core.h"
#include <stdlib.h>
#include <time.h>
#include <iostream>
using namespace std;

#define RANGE 100

// Element-wise copy: C[i] = A[i] for i in [0, n).
//
// Each thread handles at most one element, selected by its flat 1D global
// index (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is >= n
// do nothing, so the launch may contain more threads than elements.
__global__ void kernel1(float *A, float *C, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;  // surplus thread in the last block
    }
    C[idx] = A[idx];
}

// Element-wise scaling: C[i] = s * A[i] for i in [0, n).
//
// `s` is an integer scale factor that defaults to 17 when the caller omits
// it. Each thread handles at most one element, selected by its flat 1D global
// index; threads whose index is >= n do nothing.
__global__ void kernel2(float *A, float *C, int n, int s = 17) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;  // surplus thread in the last block
    }
    C[idx] = s * A[idx];
}

// Element-wise addition: C[i] = A[i] + B[i] for i in [0, n).
//
// Each thread handles at most one element, selected by its flat 1D global
// index; threads whose index is >= n do nothing.
__global__ void kernel3(float *A, float *B, float *C, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;  // surplus thread in the last block
    }
    const float lhs = A[idx];
    const float rhs = B[idx];
    C[idx] = lhs + rhs;
}

// Element-wise trigonometric combine: C[i] = sin(A[i]) + cos(B[i]) for
// i in [0, n).
//
// Each thread handles at most one element, selected by its flat 1D global
// index; threads whose index is >= n do nothing.
__global__ void kernel4(float *A, float *B, float *C, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;  // surplus thread in the last block
    }
    C[idx] = sin(A[idx]) + cos(B[idx]);
}

// Times one launch of kernel4 over two random input vectors.
//
// Usage: exp1 <num_elements> <threads_per_block>
//
// Steps: print the properties of the available devices, fill host arrays A
// and B with random floats in [0, RANGE], copy them to the device, launch
// kernel4 with enough blocks to cover every element, copy the result back and
// print the kernel time measured with the global `start`/`stop` events.
//
// Returns EXIT_FAILURE on a bad command line or a failed host allocation.
// Any failing CUDA call terminates the process through cudaCheck().
int main(int argc, char* argv[]) {
    // Validate the command line before touching argv[1] / argv[2].
    if (argc < 3) {
        cerr << "Usage: " << (argc > 0 ? argv[0] : "exp1")
             << " <num_elements> <threads_per_block>" << endl;
        return EXIT_FAILURE;
    }
    int n = atoi(argv[1]), numThreads = atoi(argv[2]);
    if (n <= 0 || numThreads <= 0) {
        // Also rules out the division by zero in the block-count computation.
        cerr << "<num_elements> and <threads_per_block> must be positive integers" << endl;
        return EXIT_FAILURE;
    }

    cudaCheck(cudaEventCreate(&start));
    cudaCheck(cudaEventCreate(&stop));
    DeviceProp();
    srand((unsigned)time(0));

    // Byte size computed in size_t so large n cannot overflow int arithmetic.
    const size_t bytes = (size_t)n * sizeof(float);
    float *A = (float *)malloc(bytes);
    float *B = (float *)malloc(bytes);
    float *C = (float *)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        cerr << "Failed to allocate host memory for " << n << " elements" << endl;
        free(A);
        free(B);
        free(C);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) {
        A[i] = (float)rand() * RANGE / RAND_MAX;
        B[i] = (float)rand() * RANGE / RAND_MAX;
    }
    cout << "Generated " << n << " elements" << endl;

    float *devA = NULL, *devB = NULL, *devC = NULL;
    if (cudaCheck(cudaMalloc((void **)&devA, bytes))) {
        cout << "Allocated device memory for A: " << endl;
    }
    if (cudaCheck(cudaMalloc((void **)&devB, bytes))) {
        cout << "Allocated device memory for B: " << endl;
    }
    if (cudaCheck(cudaMalloc((void **)&devC, bytes))) {
        cout << "Allocated device memory for C: " << endl;
    }

    if (cudaCheck(cudaMemcpy(devA, A, bytes, cudaMemcpyHostToDevice))) {
        cout << "Copied array A to device " << endl;
    }
    if (cudaCheck(cudaMemcpy(devB, B, bytes, cudaMemcpyHostToDevice))) {
        cout << "Copied array B to device " << endl;
    }

    // Ceiling division written without the (n + numThreads - 1) sum, which
    // could overflow int for large n.
    int blocks = n / numThreads + (n % numThreads != 0 ? 1 : 0);

    // The stop event is recorded immediately after the launch so the measured
    // interval brackets only the kernel; the launch status is checked after.
    cudaCheck(cudaEventRecord(start));
    kernel4<<< blocks, numThreads >>>(devA, devB, devC, n);
    cudaCheck(cudaEventRecord(stop));
    cudaCheck(cudaGetLastError());

    if (cudaCheck(cudaMemcpy(C, devC, bytes, cudaMemcpyDeviceToHost))) {
        cout << "Copied result to array C " << endl;
    }
    // Make sure the stop event has completed before reading the elapsed time.
    cudaCheck(cudaEventSynchronize(stop));

    float elapsedMs = 0;
    cudaCheck(cudaEventElapsedTime(&elapsedMs, start, stop));
    cout << "Elapsed Time (in ms): " << elapsedMs << endl;

    // Release device buffers, events and host buffers.
    cudaCheck(cudaFree(devA));
    cudaCheck(cudaFree(devB));
    cudaCheck(cudaFree(devC));
    cudaCheck(cudaEventDestroy(start));
    cudaCheck(cudaEventDestroy(stop));
    free(A);
    free(B);
    free(C);
    return 0;
}

'File written in /content/src/exp1.cu'

In [231]:
!nvcc exp1.cu core.cu -o exp1

In [287]:
!./exp1 800000 128

Name: Tesla T4
	Total Global Memory: 15843721216
	Shared Memory per Block: 49152
	Warp Size: 32
	Max Threads per Block: 1024
	Max of dimension 0 of block: 1024
	Max of dimension 1 of block: 1024
	Max of dimension 2 of block: 64
	Max of dimension 0 of grid: 2147483647
	Max of dimension 1 of grid: 65535
	Max of dimension 2 of grid: 65535
	Number of multiprocessors: 40
Generated 800000 elements
Allocated device memory for A: 
Allocated device memory for B: 
Allocated device memory for C: 
Copied array A to device 
Copied array B to device 
Copied result to array C 
Elapsed Time (in ms): 0.06176
