In [1]:
!nvidia-smi

Thu May  8 07:48:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
%%writefile matmul.cu


 #include <stdio.h>

/*
 * Square matrix product: C = A * B, with all three matrices stored as
 * N x N row-major float arrays.
 *
 * Launch layout: 2D. The x dimension of the grid/block indexes columns of C
 * and the y dimension indexes rows. Each thread computes at most one element
 * of C; threads whose (row, col) falls outside the N x N range do nothing,
 * so the launch may cover more than N x N threads.
 *
 * A, B : inputs, N*N elements each (only read here)
 * C    : output, N*N elements (each in-range element written exactly once)
 * N    : matrix dimension; nothing is written when N <= 0
 */
__global__ void matmul(float *A, float *B, float *C, int N)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the matrix have no element to compute.
    if (row >= N || col >= N)
        return;

    // Do the index arithmetic in size_t: a 32-bit int product such as
    // row * N overflows once N exceeds 46340. N > 0 is guaranteed here
    // because row >= 0 passed the guard above.
    const size_t n = (size_t)N;
    const size_t rowBase = (size_t)row * n;
    const size_t c = (size_t)col;

    // Dot product of row `row` of A with column `col` of B.
    float sum = 0.0f;
    for (size_t k = 0; k < n; k++)
        sum = sum + A[rowBase + k] * B[k * n + c];

    C[rowBase + c] = sum;
}
/*
 * Host driver for the matmul kernel: multiplies two fixed 2x2 matrices on
 * the GPU and prints the four elements of the product, one per line.
 *
 * Returns 0 on success. If any CUDA runtime call or the kernel launch
 * reports an error, a message is written to stderr and 1 is returned.
 */
int main()
{
    const int N = 2;        // matrix dimension (N x N)
    const int TILE = 16;    // threads per block along each axis
    const size_t size = (size_t)N * N * sizeof(float);

    float A[N * N] = {1, 2, 3, 4};
    float B[N * N] = {5, 6, 7, 8};
    float C[N * N];

    // Start as NULL so the unconditional cudaFree calls below are safe even
    // when an allocation failed (cudaFree on a null pointer is a no-op).
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    // Each step runs only if every previous step succeeded; `step` records
    // which stage was attempted last, for the error message.
    const char *step = "cudaMalloc";
    cudaError_t err = cudaMalloc(&d_A, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_B, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_C, size);

    if (err == cudaSuccess) {
        step = "cudaMemcpy (host to device)";
        err = cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess)
        err = cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        step = "matmul kernel launch";
        // Ceil-div so the grid covers the whole matrix for any N; the
        // kernel's own bounds check discards the surplus threads.
        dim3 threads(TILE, TILE);
        dim3 blocks((N + TILE - 1) / TILE, (N + TILE - 1) / TILE);
        matmul<<<blocks, threads>>>(d_A, d_B, d_C, N);
        // A launch does not return a status; query it explicitly.
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        step = "cudaMemcpy (device to host)";
        // Blocking copy: waits for the kernel and reports execution errors.
        err = cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", step, cudaGetErrorString(err));
        return 1;
    }

    for (int i = 0; i < N * N; i++)
        printf(" %f\n", C[i]);

    return 0;
}

Overwriting matmul.cu


In [4]:
!nvcc -arch=sm_75 matmul.cu -o matmul

In [5]:
!./matmul

 19.000000
 22.000000
 43.000000
 50.000000


In [7]:
%%writefile add.cu

 #include <stdio.h>

/*
 * Element-wise vector sum: C[i] = A[i] + B[i] for every i in [0, N).
 *
 * Launch layout: 1D. Each thread handles the single element at its global
 * x index; threads whose index is N or greater do nothing.
 */
__global__ void add(float *A, float *B, float *C, int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads past the end of the vectors exit immediately.
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}
/*
 * Host driver for the add kernel: sums two fixed 4-element vectors on the
 * GPU and prints the four results, one per line.
 *
 * Returns 0 on success. If any CUDA runtime call or the kernel launch
 * reports an error, a message is written to stderr and 1 is returned.
 */
int main()
{
    const int N = 4;                    // number of vector elements
    const int THREADS_PER_BLOCK = 256;
    const size_t size = (size_t)N * sizeof(float);

    float A[N] = {1, 2, 3, 4};
    float B[N] = {5, 6, 7, 8};
    float C[N];

    // Start as NULL so the unconditional cudaFree calls below are safe even
    // when an allocation failed (cudaFree on a null pointer is a no-op).
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    // Each step runs only if every previous step succeeded; `step` records
    // which stage was attempted last, for the error message.
    const char *step = "cudaMalloc";
    cudaError_t err = cudaMalloc(&d_A, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_B, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_C, size);

    if (err == cudaSuccess) {
        step = "cudaMemcpy (host to device)";
        err = cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess)
        err = cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        step = "add kernel launch";
        // Ceil-div grid: a single block of N threads stops working once N
        // exceeds the per-block thread limit. The kernel's bounds check
        // discards the surplus threads in the last block.
        const int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        add<<<blocks, THREADS_PER_BLOCK>>>(d_A, d_B, d_C, N);
        // A launch does not return a status; query it explicitly.
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        step = "cudaMemcpy (device to host)";
        // Blocking copy: waits for the kernel and reports execution errors.
        err = cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", step, cudaGetErrorString(err));
        return 1;
    }

    for (int i = 0; i < N; i++)
        printf(" %f\n", C[i]);

    return 0;
}

Overwriting add.cu


In [9]:
!nvcc -arch=sm_75 add.cu -o add

In [10]:
!./add

 6.000000
 8.000000
 10.000000
 12.000000
