<a href="https://colab.research.google.com/github/RaoEhsanElahi/Parallel_Processing/blob/main/cuda_c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-6jx4gnpk
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-6jx4gnpk
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4293 sha256=2e118a53f7de5985522ca27a9ea611dd5d59f377a628fdb086220a5657434bc9
  Stored in directory: /tmp/pip-ephem-wheel-cache-65hfkz2s/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin


created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void multiply(int **a, int **b, int **c, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N) {
        int sum = 0;
        for (int k = 0; k < N; k++) {
            sum += a[i][k] * b[k][j];
        }
        c[i][j] = sum;
    }
}

int main() {
    int N = 512;
    int **a = new int*[N];
    int **b = new int*[N];
    int **c = new int*[N];
    for (int i = 0; i < N; i++) {
        a[i] = new int[N];
        b[i] = new int[N];
        c[i] = new int[N];
        for (int j = 0; j < N; j++) {
            a[i][j] = i + j;
            b[i][j] = i - j;
        }
    }

    int blockSize = 16;
    dim3 blockDim(blockSize, blockSize);
    dim3 gridDim((N + blockSize - 1) / blockSize, (N + blockSize - 1) / blockSize);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    multiply<<<gridDim, blockDim>>>(a, b, c, N);
    cudaDeviceSynchronize();

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("GPU Time: %f ms\n", milliseconds);

    for (int i = 0; i < N; i++) {
      printf("\n");
      for (int j = 0; j < N; j++) {
          printf("%d",c[i][j]);
        }
    }

    return 0;
}

#Matrix Multiplication CPU vs CUDA

In [5]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Matrix multiplication without CUDA
void matrix_multiply_cpu(float *a, float *b, float *result, int size) {
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            result[i * size + j] = 0;
            for (int k = 0; k < size; k++) {
                result[i * size + j] += a[i * size + k] * b[k * size + j];
            }
        }
    }
}

// Matrix multiplication with CUDA
__global__
void matrix_multiply_cuda(float *a, float *b, float *result, int size) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < size && col < size) {
        float value = 0.0f;
        for (int k = 0; k < size; k++) {
            value += a[row * size + k] * b[k * size + col];
        }
        result[row * size + col] = value;
    }
}

int main() {
    // Set the matrix dimensions
    int matrix_size = 1000;

    // Allocate memory for matrices
    float *matrix_a, *matrix_b, *result_cpu, *result_cuda;
    size_t matrix_bytes = matrix_size * matrix_size * sizeof(float);

    matrix_a = (float *)malloc(matrix_bytes);
    matrix_b = (float *)malloc(matrix_bytes);
    result_cpu = (float *)malloc(matrix_bytes);
    result_cuda = (float *)malloc(matrix_bytes);

    // Initialize matrices with random values
    srand(time(NULL));
    for (int i = 0; i < matrix_size * matrix_size; i++) {
        matrix_a[i] = (float)rand() / RAND_MAX;
        matrix_b[i] = (float)rand() / RAND_MAX;
    }

    // Without CUDA
    clock_t start_cpu = clock();
    matrix_multiply_cpu(matrix_a, matrix_b, result_cpu, matrix_size);
    clock_t end_cpu = clock();
    double time_cpu = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC;

    // With CUDA
    float *dev_matrix_a, *dev_matrix_b, *dev_result_cuda;
    cudaMalloc((void **)&dev_matrix_a, matrix_bytes);
    cudaMalloc((void **)&dev_matrix_b, matrix_bytes);
    cudaMalloc((void **)&dev_result_cuda, matrix_bytes);

    cudaMemcpy(dev_matrix_a, matrix_a, matrix_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_matrix_b, matrix_b, matrix_bytes, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((matrix_size + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (matrix_size + threadsPerBlock.y - 1) / threadsPerBlock.y);

    clock_t start_cuda = clock();
    matrix_multiply_cuda<<<blocksPerGrid, threadsPerBlock>>>(dev_matrix_a, dev_matrix_b, dev_result_cuda, matrix_size);
    cudaDeviceSynchronize();
    clock_t end_cuda = clock();
    double time_cuda = ((double)(end_cuda - start_cuda)) / CLOCKS_PER_SEC;

    cudaMemcpy(result_cuda, dev_result_cuda, matrix_bytes, cudaMemcpyDeviceToHost);

    // Print execution times
    printf("Time without CUDA: %f seconds\n", time_cpu);
    printf("Time with CUDA: %f seconds\n", time_cuda);

    // Cleanup
    free(matrix_a);
    free(matrix_b);
    free(result_cpu);
    free(result_cuda);
    cudaFree(dev_matrix_a);
    cudaFree(dev_matrix_b);
    cudaFree(dev_result_cuda);

    return 0;
}


Time without CUDA: 8.382830 seconds
Time with CUDA: 0.064097 seconds



# **Vector** Addition

In [None]:
%%cu
#include <stdio.h>

__global__
void vectorAddition(int *a, int *b, int *c, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    const int size = 1024;  // Adjust the size as needed
    const int block_size = 256;

    // Host arrays
    int *h_a, *h_b, *h_c;
    h_a = (int *)malloc(size * sizeof(int));
    h_b = (int *)malloc(size * sizeof(int));
    h_c = (int *)malloc(size * sizeof(int));

    // Initialize host arrays
    for (int i = 0; i < size; i++) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // Device arrays
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, size * sizeof(int));
    cudaMalloc((void **)&d_b, size * sizeof(int));
    cudaMalloc((void **)&d_c, size * sizeof(int));

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size * sizeof(int), cudaMemcpyHostToDevice);

    // Launch kernel
    vectorAddition<<<(size + block_size - 1) / block_size, block_size>>>(d_a, d_b, d_c, size);

    // Copy result from device to host
    cudaMemcpy(h_c, d_c, size * sizeof(int), cudaMemcpyDeviceToHost);

    // Print result
    for (int i = 0; i < size; i++) {
        printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
    }

    // Cleanup
    free(h_a);
    free(h_b);
    free(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


#Simple Matrix Multiplication

In [9]:
%%cu
#include <stdio.h>

#define N 4

__global__
void matrixMultiply(int *a, int *b, int *c) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        int sum = 0;
        for (int i = 0; i < N; i++) {
            sum += a[row * N + i] * b[i * N + col];
        }
        c[row * N + col] = sum;
    }
}

int main() {
    const int size = N * N;
    const int block_size = 2;

    // Host arrays
    int h_a[size], h_b[size], h_c[size];

    // Initialize host arrays
    for (int i = 0; i < size; i++) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // Device arrays
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, size * sizeof(int));
    cudaMalloc((void **)&d_b, size * sizeof(int));
    cudaMalloc((void **)&d_c, size * sizeof(int));

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size * sizeof(int), cudaMemcpyHostToDevice);

    // Launch kernel
    dim3 threadsPerBlock(block_size, block_size);
    dim3 blocksPerGrid((N + block_size - 1) / block_size, (N + block_size - 1) / block_size);
    matrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c);

    // Copy result from device to host
    cudaMemcpy(h_c, d_c, size * sizeof(int), cudaMemcpyDeviceToHost);

    // Print result
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d\t", h_c[i * N + j]);
        }
        printf("\n");
    }

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


112	124	136	148	
304	348	392	436	
496	572	648	724	
688	796	904	1012	

