<a href="https://colab.research.google.com/github/SiddhiMane/Sem-8/blob/main/HPC_Prac4_cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

# Tell the reader whether PyTorch can see a CUDA device in this runtime.
print(
    "CUDA is available. You are using GPU."
    if torch.cuda.is_available()
    else "CUDA is not available. You are using CPU."
)


CUDA is available. You are using GPU.


In [14]:
%%writefile vector_addition.cu
#include <stdio.h>

#define N 1000000 // Size of vectors

// Kernel function to add two vectors
// Element-wise vector sum: each thread computes c[idx] = a[idx] + b[idx]
// for its own global index idx.
__global__ void vectorAddition(float *a, float *b, float *c) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads whose index is at or beyond N exit without touching memory.
    if (idx >= N)
        return;

    c[idx] = a[idx] + b[idx];
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `status` is an error and 0 on cudaSuccess, so calls can be
// chained as `failed = failed || cudaFailed(...)` and are skipped after the
// first error thanks to short-circuit evaluation.
static int cudaFailed(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return 0;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return 1;
}

// Adds two N-element vectors on the GPU, prints the first ten sums and checks
// every element against the host-side sum.
// Returns 0 on success, 1 if an allocation, a CUDA call, or the check fails.
int main() {
    float *a = NULL, *b = NULL, *c = NULL;       // Host vectors
    float *d_a = NULL, *d_b = NULL, *d_c = NULL; // Device vectors
    size_t size = (size_t)N * sizeof(float);
    int failed = 0;

    // Allocate memory on host
    a = (float*)malloc(size);
    b = (float*)malloc(size);
    c = (float*)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        failed = 1;
    }

    // Initialize vectors on host
    if (!failed) {
        for (int i = 0; i < N; i++) {
            a[i] = 1.0f;
            b[i] = 2.0f;
        }
    }

    // Allocate memory on device
    failed = failed || cudaFailed(cudaMalloc(&d_a, size), "cudaMalloc(d_a)");
    failed = failed || cudaFailed(cudaMalloc(&d_b, size), "cudaMalloc(d_b)");
    failed = failed || cudaFailed(cudaMalloc(&d_c, size), "cudaMalloc(d_c)");

    // Copy vectors from host to device
    failed = failed || cudaFailed(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(a -> d_a)");
    failed = failed || cudaFailed(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(b -> d_b)");

    if (!failed) {
        // Define block and grid dimensions (grid size rounded up to cover N)
        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch vector addition kernel
        vectorAddition<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c);

        // A kernel launch returns nothing; launch errors are read back here.
        failed = cudaFailed(cudaGetLastError(), "vectorAddition launch");
    }

    // Copy result from device to host. This blocking copy can also return
    // an error raised while the kernel was executing.
    failed = failed || cudaFailed(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost),
                                  "cudaMemcpy(d_c -> c)");

    if (!failed) {
        // Show a sample of the results
        for (int i = 0; i < 10 && i < N; i++) {
            printf("%f + %f = %f\n", a[i], b[i], c[i]);
        }

        // Verify every element against the host-side sum
        int mismatches = 0;
        for (int i = 0; i < N; i++) {
            if (c[i] != a[i] + b[i])
                mismatches++;
        }
        if (mismatches != 0) {
            fprintf(stderr, "Verification failed: %d of %d elements differ\n",
                    mismatches, N);
            failed = 1;
        }
    }

    // Free memory (free(NULL) and cudaFree(NULL) are no-ops, so this is safe
    // on every failure path)
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    return failed;
}


Overwriting vector_addition.cu


In [15]:
!nvcc -o vector_addition vector_addition.cu
!./vector_addition


1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000


In [16]:
%%writefile matrix_multiplication.cu
#include <stdio.h>

#define N 1024 // Size of matrices (N x N)

// Kernel function to perform matrix multiplication
// Computes one element of c = a * b for n x n matrices stored as flat arrays
// indexed [row * n + col]. Each thread handles the (row, col) given by its
// global y/x index.
__global__ void matrixMultiplication(float *a, float *b, float *c, int n) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that fall outside the matrix have nothing to compute.
    if (row >= n || col >= n)
        return;

    // Dot product of row `row` of a with column `col` of b.
    float acc = 0.0f;
    for (int k = 0; k < n; k++)
        acc += a[row * n + k] * b[k * n + col];

    c[row * n + col] = acc;
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `status` is an error and 0 on cudaSuccess, so calls can be
// chained as `failed = failed || cudaFailed(...)` and are skipped after the
// first error thanks to short-circuit evaluation.
static int cudaFailed(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return 0;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return 1;
}

// Multiplies two N x N matrices on the GPU, prints the first ten result
// elements and checks each printed element against a host-side dot product.
// Returns 0 on success, 1 if an allocation, a CUDA call, or the check fails.
int main() {
    float *a = NULL, *b = NULL, *c = NULL;       // Host matrices
    float *d_a = NULL, *d_b = NULL, *d_c = NULL; // Device matrices
    size_t count = (size_t)N * N;                // Elements per matrix
    size_t size = count * sizeof(float);
    int failed = 0;

    // Allocate memory on host
    a = (float*)malloc(size);
    b = (float*)malloc(size);
    c = (float*)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        failed = 1;
    }

    // Initialize matrices on host
    if (!failed) {
        for (size_t i = 0; i < count; i++) {
            a[i] = 1.0f;
            b[i] = 2.0f;
        }
    }

    // Allocate memory on device
    failed = failed || cudaFailed(cudaMalloc(&d_a, size), "cudaMalloc(d_a)");
    failed = failed || cudaFailed(cudaMalloc(&d_b, size), "cudaMalloc(d_b)");
    failed = failed || cudaFailed(cudaMalloc(&d_c, size), "cudaMalloc(d_c)");

    // Copy matrices from host to device
    failed = failed || cudaFailed(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(a -> d_a)");
    failed = failed || cudaFailed(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(b -> d_b)");

    if (!failed) {
        // Define block and grid dimensions (grid rounded up to cover N x N)
        dim3 threadsPerBlock(16, 16);
        dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

        // Launch matrix multiplication kernel
        matrixMultiplication<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);

        // A kernel launch returns nothing; launch errors are read back here.
        failed = cudaFailed(cudaGetLastError(), "matrixMultiplication launch");
    }

    // Copy result from device to host. This blocking copy can also return
    // an error raised while the kernel was executing.
    failed = failed || cudaFailed(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost),
                                  "cudaMemcpy(d_c -> c)");

    if (!failed) {
        // Print a few elements and verify each against a host reference
        for (size_t idx = 0; idx < 10 && idx < count; idx++) {
            printf("%f\n", c[idx]);

            // Recompute this element on the host in the same summation order
            size_t row = idx / N;
            size_t col = idx % N;
            float expected = 0.0f;
            for (size_t k = 0; k < N; k++) {
                expected += a[row * N + k] * b[k * N + col];
            }

            // Small relative tolerance: device and host rounding may differ
            float diff = c[idx] - expected;
            if (diff < 0.0f) diff = -diff;
            float scale = expected < 0.0f ? -expected : expected;
            if (diff > 1e-4f * scale + 1e-6f) {
                fprintf(stderr, "Verification failed at element %zu: got %f, expected %f\n",
                        idx, c[idx], expected);
                failed = 1;
            }
        }
    }

    // Free memory (free(NULL) and cudaFree(NULL) are no-ops, so this is safe
    // on every failure path)
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    return failed;
}


Writing matrix_multiplication.cu


In [17]:
!nvcc -o matrix_multiplication matrix_multiplication.cu
!./matrix_multiplication


2048.000000
2048.000000
2048.000000
2048.000000
2048.000000
2048.000000
2048.000000
2048.000000
2048.000000
2048.000000
