Practical 4

Write a CUDA program for:
1. Addition of two large vectors
2. Matrix multiplication using CUDA C

# 1. Vector Addition Using CUDA C

In [None]:
# Install pycuda if not already installed
!pip install -q pycuda

import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule

# CUDA kernel for element-wise vector addition, compiled by PyCUDA's SourceModule.
# Each thread derives one global index from its block and thread coordinates
# (threadIdx.x + blockDim.x * blockIdx.x) and, when that index is below n,
# writes C[idx] = A[idx] + B[idx]. The `idx < n` guard lets the launch use
# more threads than there are elements without writing past the end of C.
# The pointers are declared `int *`, so the device buffers are read as C ints;
# the host code in this cell builds its input arrays as np.int32.
mod = SourceModule("""
__global__ void vector_add(int *A, int *B, int *C, int n)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < n) {
        C[idx] = A[idx] + B[idx];
    }
}
""")

# Function to perform vector addition
def vector_add_cuda(A, B):
    """Add two integer vectors element-wise on the GPU.

    Parameters
    ----------
    A, B : array_like
        Inputs of identical shape. They are converted to C-contiguous
        ``np.int32`` arrays before upload because the ``vector_add`` kernel
        reads its buffers as ``int``.

    Returns
    -------
    numpy.ndarray
        ``np.int32`` array with the same shape as ``A`` holding ``A + B``.

    Raises
    ------
    ValueError
        If ``A`` and ``B`` do not have the same shape.
    """
    # The kernel interprets raw bytes as 32-bit ints, so normalise dtype and
    # memory layout on the host instead of trusting the caller.
    a_host = np.ascontiguousarray(A, dtype=np.int32)
    b_host = np.ascontiguousarray(B, dtype=np.int32)
    if a_host.shape != b_host.shape:
        raise ValueError(
            f"A and B must have the same shape, got {a_host.shape} and {b_host.shape}."
        )

    result = np.empty_like(a_host)
    n = a_host.size
    if n == 0:
        # Nothing to compute; a zero-byte allocation / zero-block grid is invalid.
        return result

    # Set up thread block and grid dimensions (ceiling division)
    threads_per_block = 256
    blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

    a_gpu = b_gpu = c_gpu = None
    try:
        # Allocate memory on the device
        a_gpu = cuda.mem_alloc(a_host.nbytes)
        b_gpu = cuda.mem_alloc(b_host.nbytes)
        c_gpu = cuda.mem_alloc(result.nbytes)

        # Copy data to the device
        cuda.memcpy_htod(a_gpu, a_host)
        cuda.memcpy_htod(b_gpu, b_host)

        # Launch the kernel
        kernel = mod.get_function("vector_add")
        kernel(
            a_gpu, b_gpu, c_gpu, np.int32(n),
            block=(threads_per_block, 1, 1),
            grid=(blocks_per_grid, 1),
        )

        # Copy result from device to host
        cuda.memcpy_dtoh(result, c_gpu)
    finally:
        # Release device buffers right away rather than waiting for garbage
        # collection, including when a copy or the launch raised.
        for buf in (a_gpu, b_gpu, c_gpu):
            if buf is not None:
                buf.free()

    return result

# Prompt repeatedly until the user supplies a positive integer
def safe_input_int(prompt):
    """Ask for a positive integer, re-prompting until one is entered.

    Non-integer text and values that are zero or negative each print a
    warning and trigger another prompt. Returns the accepted integer.
    """
    accepted = None
    while accepted is None:
        try:
            candidate = int(input(prompt))
        except ValueError:
            print("⚠️  Invalid input. Please enter a single integer.")
        else:
            if candidate > 0:
                accepted = candidate
            else:
                print("⚠️  Size must be positive.")
    return accepted

# Function to safely read vector elements
def safe_input_vector(prompt, n):
    """Read exactly ``n`` space-separated integers and return them as int32.

    The user is re-prompted when the line contains a non-integer token, when
    the number of values is not ``n``, or when any value does not fit in a
    32-bit signed integer (the element type of the returned array).

    Parameters
    ----------
    prompt : str
        Text shown before each attempt.
    n : int
        Required number of elements.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``np.int32`` array of length ``n``.
    """
    limits = np.iinfo(np.int32)
    while True:
        try:
            elements = [int(token) for token in input(prompt).split()]
        except ValueError:
            print("⚠️  Invalid input. Please enter integers separated by spaces.")
            continue

        if len(elements) != n:
            print(f"⚠️  You entered {len(elements)} elements, but {n} expected. Try again.")
            continue

        # Out-of-range values would overflow (or raise, depending on the NumPy
        # version) when converted to int32, so reject them before conversion.
        if any(value < limits.min or value > limits.max for value in elements):
            print(f"⚠️  Each value must be between {limits.min} and {limits.max}. Try again.")
            continue

        return np.array(elements, dtype=np.int32)

# Driver: collect the vector length and both vectors from the user
n = safe_input_int("Enter size of vectors: ")

A = safe_input_vector(f"Enter {n} elements of vector A (space-separated): ", n)
B = safe_input_vector(f"Enter {n} elements of vector B (space-separated): ", n)

# Run the addition on the GPU
C = vector_add_cuda(A, B)

# Show how every output element was obtained
print("\nAddition Steps:")
for i, (a_val, b_val, c_val) in enumerate(zip(A, B, C)):
    print(f"C[{i}] = {a_val} + {b_val} = {c_val}")

# Print the result vector on a single line, space-separated
print("\nResultant vector C:")
print(" ".join(str(value) for value in C))


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pycuda (pyproject.toml) ... [?25l[?25hdone
Enter size of vectors: 2
Enter 2 elements of vector A (space-separated): 1 2
Enter 2 elements of vector B (space-separated): 3 4

Addition Steps:
C[0] = 1 + 3 = 4
C[1] = 2 + 4 = 6

Resultant vector C:
4 6


# 2. Matrix Multiplication using CUDA (PyCUDA in Colab)

In [None]:
!pip install pycuda
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule

# CUDA kernel for square (N x N) integer matrix multiplication, compiled by
# PyCUDA's SourceModule. A, B and C are addressed as flat row-major arrays:
# element (r, c) is at index r * N + c. Each thread derives one (row, col)
# pair from its block/thread coordinates (y -> row, x -> column) and, when
# both are below N, accumulates the dot product of row `row` of A with column
# `col` of B and stores it in C[row * N + col]. The bounds check lets the
# launch grid cover more threads than matrix elements.
# NOTE(review): the accumulator is a plain `int`, so large inputs can overflow.
mod = SourceModule("""
__global__ void matmul(int *A, int *B, int *C, int N)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if(row < N && col < N) {
        int temp = 0;
        for(int i = 0; i < N; i++) {
            temp += A[row * N + i] * B[i * N + col];
        }
        C[row * N + col] = temp;
    }
}
""")

# Function to perform matrix multiplication on GPU
def matrix_multiply_cuda(A, B, N):
    """Multiply two N x N integer matrices on the GPU.

    Parameters
    ----------
    A, B : array_like
        Inputs holding exactly ``N * N`` elements each (normally shaped
        ``(N, N)``). They are converted to flat, C-contiguous ``np.int32``
        buffers because the ``matmul`` kernel reads its arguments as ``int``
        and indexes them in row-major order.
    N : int
        Matrix dimension.

    Returns
    -------
    numpy.ndarray
        ``np.int32`` array of shape ``(N, N)`` containing ``A @ B``.

    Raises
    ------
    ValueError
        If ``N`` is negative or either input does not contain ``N * N``
        elements (which would make the kernel read out of bounds).
    """
    N = int(N)
    if N < 0:
        raise ValueError(f"N must be non-negative, got {N}.")

    # Flatten matrices to contiguous 1D int32 buffers
    a_flat = np.ascontiguousarray(A, dtype=np.int32).ravel()
    b_flat = np.ascontiguousarray(B, dtype=np.int32).ravel()
    expected = N * N
    if a_flat.size != expected or b_flat.size != expected:
        raise ValueError(
            f"A and B must each contain {expected} elements for N={N}, "
            f"got {a_flat.size} and {b_flat.size}."
        )

    c_flat = np.zeros(expected, dtype=np.int32)
    if N == 0:
        # Nothing to compute; a zero-byte allocation / zero-block grid is invalid.
        return c_flat.reshape((0, 0))

    # Define grid and block dimensions (integer ceiling division)
    block_dim = (16, 16, 1)
    grid_dim = (
        (N + block_dim[0] - 1) // block_dim[0],
        (N + block_dim[1] - 1) // block_dim[1],
        1,
    )

    a_gpu = b_gpu = c_gpu = None
    try:
        # Allocate device memory
        a_gpu = cuda.mem_alloc(a_flat.nbytes)
        b_gpu = cuda.mem_alloc(b_flat.nbytes)
        c_gpu = cuda.mem_alloc(c_flat.nbytes)

        # Copy data to device memory
        cuda.memcpy_htod(a_gpu, a_flat)
        cuda.memcpy_htod(b_gpu, b_flat)

        # Launch the kernel (Matrix multiplication)
        kernel = mod.get_function("matmul")
        kernel(a_gpu, b_gpu, c_gpu, np.int32(N), block=block_dim, grid=grid_dim)

        # Copy result from device to host
        cuda.memcpy_dtoh(c_flat, c_gpu)
    finally:
        # Release device buffers right away rather than waiting for garbage
        # collection, including when a copy or the launch raised.
        for buf in (a_gpu, b_gpu, c_gpu):
            if buf is not None:
                buf.free()

    # Reshape result back to a 2D matrix
    return c_flat.reshape((N, N))

# Input size and matrices
def _read_square_matrix(label, size):
    """Read a ``size`` x ``size`` int32 matrix from stdin, one row per line.

    A row is re-requested when it contains a non-integer token, does not have
    exactly ``size`` values, or holds a value outside the int32 range.
    """
    limits = np.iinfo(np.int32)
    print(f"Enter elements for matrix {label} ({size}x{size}):")
    rows = []
    while len(rows) < size:
        try:
            row = [int(token) for token in input().split()]
        except ValueError:
            print("⚠️  Invalid input. Please enter integers separated by spaces.")
            continue
        if len(row) != size:
            print(f"⚠️  You entered {len(row)} elements, but {size} expected. Try again.")
            continue
        if any(value < limits.min or value > limits.max for value in row):
            print(f"⚠️  Each value must be between {limits.min} and {limits.max}. Try again.")
            continue
        rows.append(row)
    return np.array(rows, dtype=np.int32)


# Read the matrix dimension, re-prompting until it is a positive integer
# (a non-positive N would produce an invalid kernel launch).
while True:
    try:
        N = int(input("Enter the size of the matrices (NxN): "))
    except ValueError:
        print("⚠️  Invalid input. Please enter a single integer.")
        continue
    if N > 0:
        break
    print("⚠️  Size must be positive.")

A = _read_square_matrix("A", N)
B = _read_square_matrix("B", N)

# Perform matrix multiplication on GPU
C = matrix_multiply_cuda(A, B, N)

# Display matrices and result
print("\nMatrix A:")
print(A)
print("\nMatrix B:")
print(B)
print("\nResultant Matrix C:")
print(C)




  globals().clear()


Enter the size of the matrices (NxN): 2
Enter elements for matrix A (2x2):
1 2
3 4
Enter elements for matrix B (2x2):
5 6
7 8

Matrix A:
[[1 2]
 [3 4]]

Matrix B:
[[5 6]
 [7 8]]

Resultant Matrix C:
[[19 22]
 [43 50]]
