In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!nvidia-smi

Sat Mar  8 19:12:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [3]:
%%writefile hello.cu
#include<stdio.h>

// Kernel: every thread that runs it emits one greeting line through
// device-side printf. It takes no arguments and writes no memory.
__global__ void hello_cuda(void) {
    printf("Hello from Cuda Kernel\n");
}
// Host entry point: launches hello_cuda with one block of one thread and
// waits for the device to finish, which also flushes the kernel's printf
// output before the process exits.
// Returns 0 on success, 1 if the launch or the synchronization reports an error.
int main()
{
    hello_cuda<<<1,1>>>();

    // A kernel launch returns no status of its own; launch failures
    // (bad configuration, no usable device) are reported by cudaGetLastError().
    cudaError_t err=cudaGetLastError();
    if(err!=cudaSuccess)
    {
        fprintf(stderr,"hello_cuda launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    // Errors raised while the kernel executes surface at the next
    // synchronizing call, so its return value must be inspected too.
    err=cudaDeviceSynchronize();
    if(err!=cudaSuccess)
    {
        fprintf(stderr,"cudaDeviceSynchronize failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

Writing hello.cu


In [4]:
!nvcc hello.cu -o hello
!./hello

Hello from Cuda Kernel


In [5]:
%%writefile hello_multi.cu
#include<stdio.h>

// Kernel: each thread prints its global index, computed from its block index,
// the block width and its index inside the block (x dimension only).
__global__ void hello_cuda()
{
    const int globalIdx=blockIdx.x*blockDim.x+threadIdx.x;
    printf("Hello from thread--> %d \n",globalIdx);
}
// Host entry point: launches hello_cuda on 2 blocks of 4 threads (8 threads
// in total) and waits for the device to finish, which also flushes the
// kernel's printf output before the process exits.
// Returns 0 on success, 1 if the launch or the synchronization reports an error.
int main()
{
    hello_cuda<<<2,4>>>();

    // A kernel launch returns no status of its own; launch failures
    // (bad configuration, no usable device) are reported by cudaGetLastError().
    cudaError_t err=cudaGetLastError();
    if(err!=cudaSuccess)
    {
        fprintf(stderr,"hello_cuda launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    // Errors raised while the kernel executes surface at the next
    // synchronizing call, so its return value must be inspected too.
    err=cudaDeviceSynchronize();
    if(err!=cudaSuccess)
    {
        fprintf(stderr,"cudaDeviceSynchronize failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

Writing hello_multi.cu


In [6]:
!nvcc hello_multi.cu -o hello_multi
!./hello_multi

Hello from thread--> 0 
Hello from thread--> 1 
Hello from thread--> 2 
Hello from thread--> 3 
Hello from thread--> 4 
Hello from thread--> 5 
Hello from thread--> 6 
Hello from thread--> 7 


***Vector Addition in CUDA***

In [7]:
%%writefile vector_add.cu
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#define N 1000
// Element-wise integer addition: C[i] = A[i] + B[i] for every i in [0, size).
// Each thread handles the single element at its global x index; threads whose
// index is at or beyond size return without touching memory, so the launch may
// contain more threads than there are elements.
__global__ void vectorAdd(int *A, int *B, int *C, int size)
{
    const int gid=blockIdx.x*blockDim.x+threadIdx.x;
    if(gid>=size)
    {
        return;
    }
    C[gid]=A[gid]+B[gid];
}
// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// what: short label for the call, used in the message.
// Returns true if err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err,const char *what)
{
    if(err==cudaSuccess)
    {
        return true;
    }
    fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(err));
    return false;
}

// Host driver for the vector-addition demo: fills two N-element host vectors
// (A[i]=i, B[i]=2*i), copies them to the GPU, runs vectorAdd, copies the sum
// back and prints the first few results.
// Every allocation, copy and the kernel launch is checked; once a step fails
// the remaining steps are skipped, all buffers are released and 1 is returned.
// Returns 0 on success.
int main()
{
    int *h_A=NULL,*h_B=NULL,*h_C=NULL; //Host Vectors on CPU
    int *d_A=NULL,*d_B=NULL,*d_C=NULL; //Device Vectors on GPU

    size_t bytes=N*sizeof(int); //Mem Size

    //Allocate mem on CPU
    h_A=(int*)malloc(bytes);
    h_B=(int*)malloc(bytes);
    h_C=(int*)malloc(bytes);

    bool ok=(h_A!=NULL&&h_B!=NULL&&h_C!=NULL);
    if(!ok)
    {
        fprintf(stderr,"Host allocation of %zu bytes failed\n",bytes);
    }
    else
    {
        for(int i=0;i<N;i++)
        {
           h_A[i]=i;
           h_B[i]=i*2;
        }
    }

    //allocate on GPU; && short-circuits, so nothing runs after the first failure
    ok=ok&&cudaOk(cudaMalloc((void**)&d_A, bytes),"cudaMalloc(d_A)");
    ok=ok&&cudaOk(cudaMalloc((void**)&d_B, bytes),"cudaMalloc(d_B)");
    ok=ok&&cudaOk(cudaMalloc((void**)&d_C, bytes),"cudaMalloc(d_C)");

    //Copy data from host to device
    ok=ok&&cudaOk(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice),"cudaMemcpy(d_A)");
    ok=ok&&cudaOk(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice),"cudaMemcpy(d_B)");

    if(ok)
    {
        int threadsperblock=256;
        //ceil-div so that a partial last block still covers the tail elements
        int blockspergrid=(N+threadsperblock-1)/threadsperblock;
        vectorAdd<<<blockspergrid,threadsperblock>>>(d_A,d_B,d_C,N);
        //a launch returns no status; configuration errors show up here
        ok=cudaOk(cudaGetLastError(),"vectorAdd launch");
    }

    // Copy result back to host. This blocking copy waits for the kernel, so
    // errors raised during kernel execution are reported by it as well.
    ok=ok&&cudaOk(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost),"cudaMemcpy(h_C)");

    if(ok)
    {
        printf("Sample results: \n");
        //never read past the end of the vectors if N is smaller than 10
        for (int i = 0; i < 10 && i < N; i++) {
            printf("%d + %d = %d\n", h_A[i], h_B[i], h_C[i]);
        }
    }

    //free(NULL) and cudaFree(NULL) are no-ops, so this is safe on every path
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok?0:1;
}

Writing vector_add.cu


In [8]:
!nvcc vector_add.cu -o vector_add
!./vector_add

Sample results: 
0 + 0 = 0
1 + 2 = 3
2 + 4 = 6
3 + 6 = 9
4 + 8 = 12
5 + 10 = 15
6 + 12 = 18
7 + 14 = 21
8 + 16 = 24
9 + 18 = 27


***Matrix Multiplication using CUDA Kernels***

In [9]:
%%writefile matrix_multiplication.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 3 

// Multiplies two n x n integer matrices stored row-major in flat arrays:
// C = A * B. Each thread produces one element of C, chosen by its global
// (y, x) index as (row, column); threads that fall outside the n x n range
// return without touching memory.
__global__ void matrixMul(int *A, int *B, int *C, int n) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    if (r >= n || c >= n) {
        return;
    }

    // Dot product of row r of A with column c of B.
    const int rowStart = r * n;
    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += A[rowStart + k] * B[k * n + c];
    }
    C[rowStart + c] = acc;
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// what: short label for the call, used in the message.
// Returns true if err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Host driver for the matrix-multiplication demo: builds two N x N matrices
// (A holds 1..N*N, B holds twice those values), prints them, multiplies them
// on the GPU with matrixMul and prints the product.
// Every CUDA call and the kernel launch is checked; once a step fails the
// remaining steps are skipped, device memory is released and 1 is returned,
// so an uninitialised result matrix is never printed.
// Returns 0 on success.
int main() {
    int h_A[N * N], h_B[N * N], h_C[N * N]; // Host matrices
    int *d_A = NULL, *d_B = NULL, *d_C = NULL; // Device matrices

    size_t bytes = N * N * sizeof(int); // Memory size

    // Initialize matrices A and B
    printf("Matrix A:\n");
    for (int i = 0; i < N * N; i++) {
        h_A[i] = i + 1;
        h_B[i] = (i + 1) * 2;
        printf("%d ", h_A[i]);
        if ((i + 1) % N == 0) printf("\n"); // end of a row
    }

    printf("\nMatrix B:\n");
    for (int i = 0; i < N * N; i++) {
        printf("%d ", h_B[i]);
        if ((i + 1) % N == 0) printf("\n");
    }

    // Allocate memory on GPU; && short-circuits, so nothing runs after the
    // first failure
    bool ok = cudaOk(cudaMalloc((void**)&d_A, bytes), "cudaMalloc(d_A)");
    ok = ok && cudaOk(cudaMalloc((void**)&d_B, bytes), "cudaMalloc(d_B)");
    ok = ok && cudaOk(cudaMalloc((void**)&d_C, bytes), "cudaMalloc(d_C)");

    // Copy data from host to device
    ok = ok && cudaOk(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    ok = ok && cudaOk(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");

    if (ok) {
        // Define thread and block sizes
        dim3 threadsPerBlock(16, 16); // 16x16 threads per block
        // Ceil-div in each dimension so partial tiles at the edges are covered
        dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (N + threadsPerBlock.y - 1) / threadsPerBlock.y); // Grid size

        // Launch kernel
        matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
        // A launch returns no status; configuration errors show up here
        ok = cudaOk(cudaGetLastError(), "matrixMul launch");
    }

    // Copy result back to host. This blocking copy waits for the kernel, so
    // errors raised during kernel execution are reported by it as well.
    ok = ok && cudaOk(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");

    // Print result
    if (ok) {
        printf("\nResult Matrix C:\n");
        for (int i = 0; i < N * N; i++) {
            printf("%d ", h_C[i]);
            if ((i + 1) % N == 0) printf("\n");
        }
    }

    // Free memory; cudaFree(NULL) is a no-op, so this is safe on every path
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok ? 0 : 1;
}

Writing matrix_multiplication.cu


In [10]:
!nvcc matrix_multiplication.cu -o matrix_mul
!./matrix_mul

Matrix A:
1 2 3 
4 5 6 
7 8 9 

Matrix B:
2 4 6 
8 10 12 
14 16 18 

Result Matrix C:
60 72 84 
132 162 192 
204 252 300 
