### **CUDA** 
Below is an example that runs native CUDA code. 

1.   We check the CUDA toolkit version, the driver version and the available GPU with `nvcc --version` and `nvidia-smi`
2.   We use the IPython magic command "%%writefile filename" to save a *.cu program
3.   We then compile and run the *.cu program with nvcc







In [1]:
# Print the CUDA compiler (toolkit) release, then the driver version and the attached GPU(s).
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
Mon Jan 16 08:00:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+------


## Next, we write a native CUDA program and save it as 'vectorAdd.cu'


In [2]:
%%writefile vectorAdd.cu
#include <stdio.h>
#include <stdlib.h>
// Device kernel: reads one int through `a` and one through `b`, and stores
// their sum through `c`. Despite the file name, this adds two scalars only;
// no thread index is used, so every launched thread does the same write.
__global__ void add(int *a, int *b, int *c) {
  const int lhs = a[0];
  const int rhs = b[0];
  c[0] = lhs + rhs;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Adds two integers on the GPU and prints the result.
// Returns EXIT_SUCCESS when every CUDA call succeeded, EXIT_FAILURE otherwise.
int main() {
  // Host copies of variables a, b & c
  int a = 3;
  int b = 5;
  int c = 0;
  // Device copies of variables a, b & c (NULL so cleanup is safe on any path)
  int *d_a = NULL;
  int *d_b = NULL;
  int *d_c = NULL;
  const size_t size = sizeof(int);

  // Allocate space for device copies of a, b, c and copy the inputs over.
  // The && chain stops at the first failing call.
  bool ok = cudaOk(cudaMalloc((void **)&d_a, size), "allocating d_a") &&
            cudaOk(cudaMalloc((void **)&d_b, size), "allocating d_b") &&
            cudaOk(cudaMalloc((void **)&d_c, size), "allocating d_c") &&
            cudaOk(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice),
                   "copying a to device") &&
            cudaOk(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice),
                   "copying b to device");

  if (ok) {
    // Launch add() kernel on GPU
    add<<<1, 1>>>(d_a, d_b, d_c);
    // A kernel launch returns nothing; launch failures are only visible
    // through cudaGetLastError().
    ok = cudaOk(cudaGetLastError(), "launching add kernel") &&
         // Copy result back to host. This blocking copy also waits for the
         // kernel to finish and reports any error raised while it ran.
         cudaOk(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost),
                "copying to Host");
  }

  // Only print a result that was actually computed and copied back.
  if (ok) {
    printf("result is %d\n", c);
  }

  // Cleanup (cudaFree on a null pointer performs no operation)
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

Writing vectorAdd.cu


## We compile the saved CUDA code using the nvcc compiler

In [3]:
# Compile vectorAdd.cu into the executable ./vectorAdd, then list the directory to confirm it was created.
!nvcc vectorAdd.cu -o vectorAdd
!ls


sample_data  vectorAdd	vectorAdd.cu


## Finally, we execute the binary of the compiled code

In [4]:
!./vectorAdd

result is 8


In [5]:
# Build lab3_ex2_template.cu into ./vecmul for the sm_75 architecture, adding the CUDA samples' common headers to the include path.
!nvcc -arch=sm_75 -I/usr/local/cuda/samples/common/inc lab3_ex2_template.cu -o vecmul

In [6]:
# Profile ./vecmul with the Nsight Compute CLI; the four arguments are echoed by the program as matrix dimensions.
# NOTE(review): the program's own "Kernel Time" is measured while the profiler replays the kernel in several passes,
# so it presumably includes profiling overhead; prefer "Elapsed Cycles" when comparing sizes. Confirm against the timing code.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 32 32 32 32

Input matrix dim (32 x 32) (32 x 32) (32 x 32)
==PROF== Connected to process 2586 (/content/vecmul)
Transfer time host to device 0.000045 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.776861 seconds
Transfer Time device to host 0.000049 seconds
==PROF== Disconnected from process 2586
[2586] vecmul@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2023-Jan-16 08:07:03, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.96
    SM Frequency                                                             cycle/usecond                         580.71
    Elapsed Cycles                                                                   cycle                         20,349
    Memory [%]                        

In [7]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 128 128 128 128

Input matrix dim (128 x 128) (128 x 128) (128 x 128)
==PROF== Connected to process 3466 (/content/vecmul)
Transfer time host to device 0.000117 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.668961 seconds
Transfer Time device to host 0.000125 seconds
==PROF== Disconnected from process 3466
[3466] vecmul@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2023-Jan-16 08:10:35, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         585.71
    Elapsed Cycles                                                                   cycle                         71,413
    Memory [%]                  

In [8]:
# Non-square case with sizes that are not powers of two. Judging by the echoed "Input matrix dim" line, the arguments
# are read as <A rows> <A cols> <B rows> <B cols>, giving an (A rows x B cols) result matrix.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 511 1023 1023 4094

Input matrix dim (511 x 1023) (1023 x 4094) (511 x 4094)
==PROF== Connected to process 3616 (/content/vecmul)
Transfer time host to device 0.008077 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 1.039614 seconds
Transfer Time device to host 0.010823 seconds
==PROF== Disconnected from process 3616
[3616] vecmul@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2023-Jan-16 08:11:37, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.99
    SM Frequency                                                             cycle/usecond                         584.87
    Elapsed Cycles                                                                   cycle                     28,111,915
    Memory [%]              

In [10]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 1024 2048 2048 4096

Input matrix dim (1024 x 2048) (2048 x 4096) (1024 x 4096)
==PROF== Connected to process 4737 (/content/vecmul)
Transfer time host to device 0.018661 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 2.194466 seconds
Transfer Time device to host 0.021263 seconds
==PROF== Disconnected from process 4737
[4737] vecmul@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2023-Jan-16 08:20:25, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         584.99
    Elapsed Cycles                                                                   cycle                    111,637,768
    Memory [%]            

In [11]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 256 256 256 256

Input matrix dim (256 x 256) (256 x 256) (256 x 256)
==PROF== Connected to process 6269 (/content/vecmul)
Transfer time host to device 0.000326 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.640931 seconds
Transfer Time device to host 0.000403 seconds
==PROF== Disconnected from process 6269
[6269] vecmul@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2023-Jan-16 08:21:36, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.99
    SM Frequency                                                             cycle/usecond                         584.75
    Elapsed Cycles                                                                   cycle                        277,878
    Memory [%]                  

In [12]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 512 512 512 512

Input matrix dim (512 x 512) (512 x 512) (512 x 512)
==PROF== Connected to process 6365 (/content/vecmul)
Transfer time host to device 0.001008 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.649788 seconds
Transfer Time device to host 0.001399 seconds
==PROF== Disconnected from process 6365
[6365] vecmul@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2023-Jan-16 08:21:56, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.99
    SM Frequency                                                             cycle/usecond                         584.84
    Elapsed Cycles                                                                   cycle                      1,911,386
    Memory [%]                  

In [13]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 1024 1024 1024 1024

Input matrix dim (1024 x 1024) (1024 x 1024) (1024 x 1024)
==PROF== Connected to process 6473 (/content/vecmul)
Transfer time host to device 0.003775 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.825443 seconds
Transfer Time device to host 0.005415 seconds
==PROF== Disconnected from process 6473
[6473] vecmul@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2023-Jan-16 08:22:28, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         584.91
    Elapsed Cycles                                                                   cycle                     14,119,300
    Memory [%]            

In [14]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 2048 2048 2048 2048

Input matrix dim (2048 x 2048) (2048 x 2048) (2048 x 2048)
==PROF== Connected to process 6631 (/content/vecmul)
Transfer time host to device 0.015762 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 2.196003 seconds
Transfer Time device to host 0.026331 seconds
==PROF== Disconnected from process 6631
[6631] vecmul@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2023-Jan-16 08:27:05, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         585.00
    Elapsed Cycles                                                                   cycle                    111,632,411
    Memory [%]            