In [6]:
!apt-get install nvidia-cuda-toolkit

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
nvidia-cuda-toolkit is already the newest version (11.5.1-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [7]:
!nvcc --version    # Shows CUDA compiler version
!nvidia-smi        # Shows driver version and compatible CUDA version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Thu Mar 13 06:20:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                       

In [8]:
%%writefile vector_add.cu
#include <stdio.h>

// Element-wise vector sum: each thread handles at most one index and writes
// C[idx] = A[idx] + B[idx]. Threads whose flat index falls at or beyond n
// exit without touching memory, so the launch may cover more threads than n.
//   A, B : input arrays, read at index idx
//   C    : output array, written at index idx
//   n    : number of elements to process
__global__
void vector_addition_kernel(float *A, float *B, float *C, int n){
  // Flat 1D index built from the block offset plus the thread's lane in the block.
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= n){
    return;
  }
  C[idx] = A[idx] + B[idx];
}

// Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess.
static bool cuda_call_ok(cudaError_t status, const char *what){
  if (status == cudaSuccess){
    return true;
  }
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Adds two host vectors on the GPU and prints the leading results.
//   A, B : host arrays of at least n floats (inputs, copied host -> device)
//   C    : host array of at least n floats (output, copied device -> host)
//   n    : element count; nothing is done when n <= 0
// On success the first min(n, 5) elements of C are printed to stdout.
// If any CUDA call fails, the error is reported on stderr, the device buffers
// are released, and nothing is printed because C does not hold a valid result.
__host__
void vector_addition_host(float *A, float *B, float *C, int n){
  // A non-positive count would yield a zero-block launch (invalid configuration).
  if (n <= 0){
    return;
  }

  // Byte count is computed in size_t so that large n cannot overflow an int.
  const size_t size = (size_t) n * sizeof(float);
  const int threads_per_block = 256;
  // Integer ceil-division; written as (n - 1) / t + 1 so n near INT_MAX cannot overflow.
  const int blocks = (n - 1) / threads_per_block + 1;

  // Create the variables inside the device and copy the values from host to device.
  // Pointers start as NULL so the cleanup below is safe after a partial failure.
  float *A_d = NULL, *B_d = NULL, *C_d = NULL;
  bool ok = cuda_call_ok(cudaMalloc((void **) &A_d, size), "cudaMalloc(A_d)")
         && cuda_call_ok(cudaMalloc((void **) &B_d, size), "cudaMalloc(B_d)")
         && cuda_call_ok(cudaMalloc((void **) &C_d, size), "cudaMalloc(C_d)")
         && cuda_call_ok(cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice), "cudaMemcpy(A -> A_d)")
         && cuda_call_ok(cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice), "cudaMemcpy(B -> B_d)");

  // Process the vector addition on the device.
  if (ok){
    vector_addition_kernel <<<blocks, threads_per_block>>>(A_d, B_d, C_d, n);
    // A kernel launch returns nothing; launch errors must be queried explicitly.
    ok = cuda_call_ok(cudaGetLastError(), "vector_addition_kernel launch");
  }

  // Copy back from device to host; this blocking copy also surfaces any
  // error raised while the kernel was executing.
  if (ok){
    ok = cuda_call_ok(cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost), "cudaMemcpy(C_d -> C)");
  }

  // Free the device memory on every path; cudaFree on a NULL pointer is a no-op.
  cudaFree(A_d);
  cudaFree(B_d);
  cudaFree(C_d);

  if (!ok){
    return;
  }

  // Never read past the n elements the caller provided.
  const int shown = n < 5 ? n : 5;
  for(int i=0; i<shown; i++){
    printf("C at %d position is %f\n", i, C[i]);
  }
}
// Demo driver: adds two fixed 5-element vectors through vector_addition_host,
// which writes the sums into the zero-initialised output array.
int main(){
  const int count = 5;

  float X[5] = {1, 2, 3, 4, 5};
  float Y[5] = {6, 7, 8, 9, 10};
  float Z[5] = {};   // output buffer, starts as all zeros

  // The arrays decay to pointers to their first element at the call site.
  vector_addition_host(X, Y, Z, count);
  return 0;
}

Overwriting vector_add.cu


In [9]:
!nvcc -arch=sm_75 vector_add.cu -o vector_add

In [10]:
!./vector_add

C at 0 position is 7.000000
C at 1 position is 9.000000
C at 2 position is 11.000000
C at 3 position is 13.000000
C at 4 position is 15.000000
