### **CUDA** 
Below is an example that runs native CUDA code. 

1.   We inspect the CUDA version, the driver and the available GPU with `nvcc --version` and `nvidia-smi`
2.   We use the IPython magic command "%%writefile filename" to save a *.cu program
3.   We then compile and run the *.cu program with nvcc







In [1]:
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
Mon Jan 16 15:10:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8    12W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+------


## Next, we write native CUDA code and save it as 'vectorAdd.cu'


In [2]:
%%writefile vectorAdd.cu
#include <stdio.h>
#include <stdlib.h>
// Kernel: reads one int through `a` and one through `b`, and stores their sum
// through `c`.
//
// The pointers are dereferenced in device code, so they must refer to memory
// the GPU can access. No thread or block index is used: every thread that runs
// this kernel performs the same read-add-write on element 0.
__global__ void add(int *a, int *b, int *c) {
    const int lhs = a[0];
    const int rhs = b[0];
    c[0] = lhs + rhs;
}
// Computes a + b on the GPU with the add() kernel and stores the sum in *result.
//
// Returns cudaSuccess when every step succeeded; otherwise returns the error of
// the first CUDA call that failed, in which case *result is not meaningful.
// Device buffers are released on every path before returning.
static cudaError_t addOnDevice(int a, int b, int *result) {
    // Device copies of a, b and the sum. They start as NULL so that the
    // unconditional cudaFree() calls below are no-ops for any buffer that was
    // never allocated.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    const size_t size = sizeof(int);

    // Each step runs only if all previous steps succeeded, so `err` always
    // holds the first failure.
    cudaError_t err = cudaMalloc((void **)&d_a, size);
    if (err == cudaSuccess) err = cudaMalloc((void **)&d_b, size);
    if (err == cudaSuccess) err = cudaMalloc((void **)&d_c, size);

    // Copy inputs to device
    if (err == cudaSuccess) err = cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU
    if (err == cudaSuccess) {
        add<<<1, 1>>>(d_a, d_b, d_c);
        // A kernel launch returns nothing; launch failures must be queried.
        err = cudaGetLastError();
    }

    // Copy result back to host. This blocking copy waits for the kernel and
    // also reports errors raised while the kernel was executing.
    if (err == cudaSuccess) err = cudaMemcpy(result, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return err;
}

// Adds 3 and 5 on the GPU and prints the sum.
//
// Exits with EXIT_SUCCESS after printing "result is <sum>", or reports the CUDA
// error on stderr and exits with EXIT_FAILURE without printing a result.
int main() {
    // Host copies of the inputs and the output
    const int a = 3;
    const int b = 5;
    int c = 0;

    const cudaError_t err = addOnDevice(a, b, &c);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    printf("result is %d\n", c);
    return EXIT_SUCCESS;
}

Writing vectorAdd.cu


## We compile the saved CUDA code using the nvcc compiler

In [3]:
!nvcc vectorAdd.cu -o vectorAdd
!ls


sample_data  vectorAdd	vectorAdd.cu


## Finally, we execute the binary of the compiled code

In [4]:
!./vectorAdd

result is 8


In [19]:
!nvcc -arch=sm_75 -I/usr/local/cuda/samples/common/inc lab3_ex3_template.cu -o histo

In [20]:
!./histo 1024

The input length is 1024
3, 1, 1, 0, 3, 1, 2, 2, 0, 2, 0, 1, 2, 2, 3, 2, 2, 1, 0, 2, 3, 1, 1, 2, 0, 1, 1, 1, 2, 1, 0, 1, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 0, 2, 0, 1, 2, 0, 2, 1, 0, 2, 0, 1, 1, 0, 2, 0, 1, 0, 1, 1, 3, 0, 1, 1, 1, 1, 3, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2, 1, 0, 2, 0, 2, 0, 1, 0, 1, 0, 0, 2, 0, 0, 1, 1, 1, 1, 1, 2, 0, 0, 2, 1, 0, 1, 2, 1, 0, 0, 0, 2, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2, 1, 0, 3, 0, 3, 1, 4, 1, 0, 1, 2, 2, 2, 1, 1, 1, 1, 3, 1, 2, 0, 2, 2, 1, 4, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 1, 2, 1, 0, 2, 1, 1, 2, 0, 0, 1, 3, 2, 0, 0, 2, 0, 0, 3, 1, 2, 2, 0, 1, 0, 3, 1, 1, 1, 1, 3, 1, 2, 1, 0, 1, 2, 1, 1, 2, 2, 1, 0, 1, 1, 4, 3, 2, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 2, 1, 0, 0, 1, 0, 4, 1, 0, 2, 1, 0, 1, 0, 1, 1, 1, 2, 0, 1, 3, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 1, 1, 1, 3, 0, 0, 0, 3, 0, 4, 0, 1, 2, 0, 2, 1, 0, 1, 1, 0, 0, 1, 3, 1, 0, 0, 3, 2, 0, 0, 0, 

In [21]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./histo 1024

The input length is 1024
==PROF== Connected to process 2170 (/content/histo)
==PROF== Profiling "histogram_kernel" - 1: 0%....50%....100% - 8 passes
==PROF== Profiling "convert_kernel" - 2: 0%....50%....100% - 8 passes
1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 1, 4, 1, 0, 0, 1, 2, 1, 0, 0, 0, 1, 3, 0, 0, 1, 1, 2, 0, 0, 1, 0, 2, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 2, 2, 1, 0, 1, 1, 0, 1, 0, 1, 1, 2, 0, 1, 2, 1, 0, 0, 1, 0, 1, 2, 1, 0, 1, 2, 1, 0, 1, 1, 3, 1, 1, 0, 0, 1, 1, 3, 2, 0, 1, 1, 1, 0, 2, 1, 2, 2, 4, 2, 5, 0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0, 2, 1, 1, 0, 3, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 2, 2, 3, 3, 0, 1, 1, 0, 0, 1, 0, 0, 1, 2, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1, 0, 1, 0, 0, 2, 0, 0, 1, 0, 1, 1, 3, 1, 0, 3, 0, 0, 1, 1, 1, 2, 0, 2, 2, 0, 1, 0, 3, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 3, 1, 2, 4, 1, 1, 1, 0, 2, 0, 1, 1, 1, 1, 1, 2, 0, 2, 1, 0, 1, 1, 1, 1, 2, 2, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 1, 0, 3, 1, 2, 2, 1, 1, 1, 0, 2, 0, 1, 4, 0, 1, 1, 0, 0, 2, 0, 1, 0, 2, 0, 0