### **CUDA** 
Below is an example that runs native CUDA code. 

1.   We inspect the CUDA toolkit version, the driver, and the available GPU with `nvcc --version` and `nvidia-smi`
2.   We use the IPython magic command "%%writefile filename" to save a *.cu program
3.   We then compile and run the *.cu program with nvcc







In [None]:
# Print the nvcc (CUDA compiler) release, then the driver version and the GPU(s) visible to it.
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
Mon Jan 16 08:00:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+------


## Next, we write a native CUDA program and save it as 'vectorAdd.cu'


In [None]:
%%writefile vectorAdd.cu
#include <stdio.h>
#include <stdlib.h>
// Device kernel: reads one int from `a` and one from `b` and stores their sum
// in the int that `c` points to. There is no per-thread indexing, so every
// thread that executes this kernel performs the same single-element addition.
__global__ void add(int *a, int *b, int *c) {
    const int lhs = a[0];
    const int rhs = b[0];
    c[0] = lhs + rhs;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names the
// step that was attempted and is embedded in the error message.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two integers on the GPU and prints the result.
//
// Steps: allocate device buffers, copy the inputs to the device, launch the
// add() kernel with a single thread, copy the sum back, print it, free the
// device buffers.
//
// Returns EXIT_SUCCESS when every CUDA call succeeded and the result was
// printed; otherwise an error is written to stderr, no result is printed, and
// EXIT_FAILURE is returned.
int main() {
    // Host copies of variables a, b & c
    int a = 3;
    int b = 5;
    int c = 0;

    // Device copies of variables a, b & c. Initialised to NULL so the cleanup
    // below is safe even if an allocation fails (cudaFree(NULL) is a no-op).
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;

    const size_t size = sizeof(int);

    // Allocate space for device copies of a, b, c. The && chain stops at the
    // first failure so later steps never touch an unallocated buffer.
    bool ok = cudaOk(cudaMalloc((void **)&d_a, size), "allocating d_a")
           && cudaOk(cudaMalloc((void **)&d_b, size), "allocating d_b")
           && cudaOk(cudaMalloc((void **)&d_c, size), "allocating d_c");

    // Copy inputs to device
    ok = ok
      && cudaOk(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice), "copying a to device")
      && cudaOk(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice), "copying b to device");

    if (ok) {
        // Launch add() kernel on GPU
        add<<<1, 1>>>(d_a, d_b, d_c);
        // A kernel launch returns no status itself; launch errors must be
        // fetched explicitly.
        ok = cudaOk(cudaGetLastError(), "launching add kernel");
    }

    // Copy result back to host
    ok = ok && cudaOk(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost), "copying to Host");

    // Only report a result that was actually computed and copied back.
    if (ok) {
        printf("result is %d\n", c);
    }

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

Writing vectorAdd.cu


## We compile the saved CUDA code using the nvcc compiler

In [None]:
# Compile vectorAdd.cu into the executable ./vectorAdd, then list the directory to confirm the binary exists.
!nvcc vectorAdd.cu -o vectorAdd
!ls


sample_data  vectorAdd	vectorAdd.cu


## Finally, we execute the binary of the compiled code

In [None]:
# Run the compiled binary; it prints the computed result to stdout.
!./vectorAdd

result is 8


In [5]:
# Build vecmul.cu for the sm_75 architecture, adding the CUDA samples' common headers to the include path.
# NOTE(review): no cell visible here writes vecmul.cu — it must already be present in the working directory.
!nvcc -arch=sm_75 -I/usr/local/cuda/samples/common/inc vecmul.cu -o vecmul

In [9]:
# Profile ./vecmul with the Nsight Compute CLI. The four arguments are the row/column counts of the two input matrices (here 32 x 32 and 32 x 32).
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 32 32 32 32

Input matrix dim (32 x 32) (32 x 32) (32 x 32)
==PROF== Connected to process 1109 (/content/vecmul)
Transfer time host to device 0.000036 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.695777 seconds
Transfer Time device to host 0.000046 seconds
==PROF== Disconnected from process 1109
[1109] vecmul@127.0.0.1
  gemm(float*, float*, float*, int, int, int, int), 2023-Jan-16 19:13:14, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.94
    SM Frequency                                                             cycle/usecond                         579.76
    Elapsed Cycles                                                                   cycle                          6,791
    Memory [%]                           

In [7]:
# Profile ./vecmul with the Nsight Compute CLI for input matrices of 128 x 128 and 128 x 128.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 128 128 128 128

Input matrix dim (128 x 128) (128 x 128) (128 x 128)
==PROF== Connected to process 964 (/content/vecmul)
Transfer time host to device 0.000094 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.637157 seconds
Transfer Time device to host 0.000121 seconds
==PROF== Disconnected from process 964
[964] vecmul@127.0.0.1
  gemm(float*, float*, float*, int, int, int, int), 2023-Jan-16 19:12:50, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         585.15
    Elapsed Cycles                                                                   cycle                         20,248
    Memory [%]                        

In [11]:
# Profile ./vecmul with the Nsight Compute CLI for non-power-of-two inputs: 511 x 1023 times 1023 x 4094.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 511 1023 1023 4094

Input matrix dim (511 x 1023) (1023 x 4094) (511 x 4094)
==PROF== Connected to process 2219 (/content/vecmul)
Transfer time host to device 0.004398 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.761530 seconds
Transfer Time device to host 0.005504 seconds
==PROF== Disconnected from process 2219
[2219] vecmul@127.0.0.1
  gemm(float*, float*, float*, int, int, int, int), 2023-Jan-16 19:17:50, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         585.00
    Elapsed Cycles                                                                   cycle                      7,948,216
    Memory [%]                 

In [10]:
# Profile ./vecmul with the Nsight Compute CLI for rectangular inputs: 1024 x 2048 times 2048 x 4096.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 1024 2048 2048 4096

Input matrix dim (1024 x 2048) (2048 x 4096) (1024 x 4096)
==PROF== Connected to process 1197 (/content/vecmul)
Transfer time host to device 0.008818 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 1.056773 seconds
Transfer Time device to host 0.010558 seconds
==PROF== Disconnected from process 1197
[1197] vecmul@127.0.0.1
  gemm(float*, float*, float*, int, int, int, int), 2023-Jan-16 19:17:13, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         585.05
    Elapsed Cycles                                                                   cycle                     31,018,086
    Memory [%]               

In [12]:
# Profile ./vecmul with the Nsight Compute CLI for input matrices of 256 x 256 and 256 x 256.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 256 256 256 256

Input matrix dim (256 x 256) (256 x 256) (256 x 256)
==PROF== Connected to process 2323 (/content/vecmul)
Transfer time host to device 0.000175 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.683763 seconds
Transfer Time device to host 0.000256 seconds
==PROF== Disconnected from process 2323
[2323] vecmul@127.0.0.1
  gemm(float*, float*, float*, int, int, int, int), 2023-Jan-16 19:17:52, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.98
    SM Frequency                                                             cycle/usecond                         582.60
    Elapsed Cycles                                                                   cycle                         76,872
    Memory [%]                     

In [13]:
# Profile ./vecmul with the Nsight Compute CLI for input matrices of 512 x 512 and 512 x 512.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 512 512 512 512

Input matrix dim (512 x 512) (512 x 512) (512 x 512)
==PROF== Connected to process 2347 (/content/vecmul)
Transfer time host to device 0.000556 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.679272 seconds
Transfer Time device to host 0.000802 seconds
==PROF== Disconnected from process 2347
[2347] vecmul@127.0.0.1
  gemm(float*, float*, float*, int, int, int, int), 2023-Jan-16 19:17:54, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         585.25
    Elapsed Cycles                                                                   cycle                        516,879
    Memory [%]                     

In [14]:
# Profile ./vecmul with the Nsight Compute CLI for input matrices of 1024 x 1024 and 1024 x 1024.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 1024 1024 1024 1024

Input matrix dim (1024 x 1024) (1024 x 1024) (1024 x 1024)
==PROF== Connected to process 2379 (/content/vecmul)
Transfer time host to device 0.002058 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 0.704722 seconds
Transfer Time device to host 0.002732 seconds
==PROF== Disconnected from process 2379
[2379] vecmul@127.0.0.1
  gemm(float*, float*, float*, int, int, int, int), 2023-Jan-16 19:18:08, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         584.88
    Elapsed Cycles                                                                   cycle                      3,897,031
    Memory [%]               

In [15]:
# Profile ./vecmul with the Nsight Compute CLI for input matrices of 2048 x 2048 and 2048 x 2048.
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./vecmul 2048 2048 2048 2048

Input matrix dim (2048 x 2048) (2048 x 2048) (2048 x 2048)
==PROF== Connected to process 2459 (/content/vecmul)
Transfer time host to device 0.007310 seconds
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
Kernel Time 1.060314 seconds
Transfer Time device to host 0.010693 seconds
==PROF== Disconnected from process 2459
[2459] vecmul@127.0.0.1
  gemm(float*, float*, float*, int, int, int, int), 2023-Jan-16 19:20:10, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         585.30
    Elapsed Cycles                                                                   cycle                     31,022,873
    Memory [%]               