<a href="https://colab.research.google.com/github/SouvikBandyopadhyay/CUDA_min_max_sum_matrixmul/blob/main/cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code to Initialize CUDA development environment on Colab Notebook

In [None]:
!ls /usr/local/

bin    cuda	cuda-12.2  games	       include	lib64	   man	 share
colab  cuda-12	etc	   _gcs_config_ops.so  lib	licensing  sbin  src


In [None]:
!which nvcc

/usr/bin/nvcc


In [None]:
!nvidia-smi

Sat Dec 16 07:49:09 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!apt install nvidia-cuda-toolkit


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
nvidia-cuda-toolkit is already the newest version (11.5.1-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


# This code finds the minimum and maximum value from an array using CUDA.
# Kernel `min_max_kernel`:
**1. Thread Index Calculation:**
* `int tid = threadIdx.x + blockIdx.x * blockDim.x;` calculates the thread ID within the grid.

**2. Local Min and Max Initialization:**
* `int local_min = arr[0];` and `int local_max = arr[0];` initialize local variables with the first element of the array.

**3. Find Local Min and Max:**
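* Each thread walks the array starting at index `tid` and advancing by the total number of launched threads (`blockDim.x * gridDim.x`) until the index reaches `n`.
* A thread therefore inspects the elements `arr[tid]`, `arr[tid + stride]`, `arr[tid + 2 * stride]`, ... (with `stride = blockDim.x * gridDim.x`) rather than one contiguous range.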
* For each element, it updates `local_min` and `local_max` if a smaller or larger value is found, respectively.

**4. Atomic Operations:**
* `atomicMin(min_val, local_min);` and `atomicMax(max_val, local_max);` atomically update the global minimum and maximum values (`min_val` and `max_val`) with the local minimum and maximum found by each thread.

# `main()` Function:
**1. Array Initialization and Allocation:**
* Allocates memory for the array `arr`.
* Initializes the array with random values and prints its elements.

**2. Device Memory Allocation and Copying:**
* Allocates memory on the GPU for `d_arr` (array data), `d_min`, and `d_max`.
* Copies the array data from the host to the device.

**3. Initialize Min and Max Values:**
* Initializes `min_val` and `max_val` variables with the first element of the array.
* Copies these initial values to device memory (`d_min` and `d_max`).

**4. Kernel Launch:**
* Calculates the number of blocks needed based on the array size and launches the `min_max_kernel` with the configured block size.
* Each thread processes its share of the array elements to find a local minimum and maximum, which it then merges into the global result with atomic operations.

**5. Copy Results Back to Host:**
* Copies the updated minimum and maximum values from the device to the host.

**6. Display Minimum and Maximum Values:**
* Prints the minimum and maximum values found in the array.

**7. Memory Cleanup:**
* Frees the allocated memory on both the host and the device.



In summary, this CUDA code employs parallel computation to swiftly identify the minimum and maximum values within an array using GPU-based kernels. By allocating threads to calculate local minimum and maximum values across the array segments and merging these values using atomic operations, the code efficiently determines the overall minimum and maximum. This implementation optimizes the computation by leveraging GPU parallelism, enhancing the speed of finding the array's extremes.







In [None]:
%%writefile minmax.cu

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

/*
 * min_max_kernel: merges the minimum and maximum of arr[0..n) into *min_val
 * and *max_val using atomicMin / atomicMax.
 *
 * Parameters:
 *   arr      device array of n ints (read only by this kernel)
 *   n        number of valid elements in arr; n <= 0 makes the kernel a no-op
 *   min_val  device int that must already hold an initial value; it is only
 *            ever lowered
 *   max_val  device int that must already hold an initial value; it is only
 *            ever raised
 *
 * Launch layout: 1-D grid of 1-D blocks. Any grid/block size gives the same
 * result because each thread starts at its global index and advances by the
 * total number of launched threads. No shared memory is used.
 */
__global__ void min_max_kernel(int* arr, int n, int* min_val, int* max_val) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads that own no element leave right away. This keeps an empty
    // array (n == 0) from being dereferenced and spares the two atomics
    // that would otherwise be issued with no new information.
    if (tid >= n) {
        return;
    }

    int local_min = arr[0];
    int local_max = arr[0];

    // Distance between consecutive elements visited by the same thread.
    int step = blockDim.x * gridDim.x;

    for (int i = tid; i < n; i += step) {
        int value = arr[i];  // read once, compare twice
        if (value < local_min) {
            local_min = value;
        }
        if (value > local_max) {
            local_max = value;
        }
    }

    // Fold this thread's extremes into the shared results.
    atomicMin(min_val, local_min);
    atomicMax(max_val, local_max);
}

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the step that was being attempted.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills a small array with pseudo-random values, finds its minimum and
// maximum on the GPU with min_max_kernel, and prints both.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any
// CUDA call fails.
int main() {
    int n = 10; // array size
    int *arr, *d_arr, *d_min, *d_max;
    int min_val, max_val;

    // Allocate memory for the array
    arr = (int*)malloc(n * sizeof(int));
    if (arr == NULL) {
        fprintf(stderr, "Host allocation of %d ints failed\n", n);
        return EXIT_FAILURE;
    }

    // Initialize array with random values
    printf("Inital Array\n");
    for (int i = 0; i < n; ++i) {
        arr[i] = rand() % 1000;
        printf("elem%d = %d, ",i,arr[i]);
    }
    printf("\n");

    // Allocate memory on device
    check_cuda(cudaMalloc((void**)&d_arr, n * sizeof(int)), "cudaMalloc d_arr");
    check_cuda(cudaMalloc((void**)&d_min, sizeof(int)), "cudaMalloc d_min");
    check_cuda(cudaMalloc((void**)&d_max, sizeof(int)), "cudaMalloc d_max");

    // Copy array to device
    check_cuda(cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice),
               "copy array to device");

    // Initialize min and max values with the first element so the atomic
    // updates in the kernel start from a value that is really in the array.
    min_val = arr[0];
    max_val = arr[0];
    check_cuda(cudaMemcpy(d_min, &min_val, sizeof(int), cudaMemcpyHostToDevice),
               "seed d_min");
    check_cuda(cudaMemcpy(d_max, &max_val, sizeof(int), cudaMemcpyHostToDevice),
               "seed d_max");

    // Launch kernel: enough blocks of BLOCK_SIZE threads to cover n elements
    int num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    min_max_kernel<<<num_blocks, BLOCK_SIZE>>>(d_arr, n, d_min, d_max);
    // A launch reports no status itself: configuration errors show up in
    // cudaGetLastError(), faults during execution at the next synchronisation.
    check_cuda(cudaGetLastError(), "min_max_kernel launch");
    check_cuda(cudaDeviceSynchronize(), "min_max_kernel execution");

    // Copy result back to host
    check_cuda(cudaMemcpy(&min_val, d_min, sizeof(int), cudaMemcpyDeviceToHost),
               "copy minimum to host");
    check_cuda(cudaMemcpy(&max_val, d_max, sizeof(int), cudaMemcpyDeviceToHost),
               "copy maximum to host");

    printf("Minimum value: %d\n", min_val);
    printf("Maximum value: %d\n", max_val);

    // Free device memory
    check_cuda(cudaFree(d_arr), "cudaFree d_arr");
    check_cuda(cudaFree(d_min), "cudaFree d_min");
    check_cuda(cudaFree(d_max), "cudaFree d_max");

    // Free host memory
    free(arr);

    return 0;
}


Overwriting minmax.cu


In [None]:
!nvcc minmax.cu -o minmax

In [None]:
!./minmax

Inital Array
elem0 = 383, elem1 = 886, elem2 = 777, elem3 = 915, elem4 = 793, elem5 = 335, elem6 = 386, elem7 = 492, elem8 = 649, elem9 = 421, 
Minimum value: 335
Maximum value: 915


# This code calculates the summation of all elements in an array using CUDA.
# Kernel `sum_reduce_kernel`:
**1. Shared Memory Allocation:**
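* `extern __shared__ int shared[];` declares a dynamically sized shared-memory array. Its size in bytes is not fixed in the kernel; it is supplied as the third kernel-launch parameter (`block_size * sizeof(int)` in `main()`).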

**2. Thread Index Calculation:**
* `int tid = threadIdx.x;` obtains the thread ID within the block.
* `int idx = blockIdx.x * blockDim.x + threadIdx.x;` calculates the global index of the thread.

**3. Loading Data into Shared Memory:**
* `if (idx < n)` checks if the thread index is within the array bounds.
* Each thread loads an element of the array into shared memory (`shared[tid] = arr[idx];`). If the thread is out of bounds, it loads 0.

**4. Synchronization:**
* `__syncthreads();` ensures all threads have loaded their data into shared memory before proceeding.

**5. Parallel Reduction:**
* The kernel performs a parallel reduction to find the sum of the array elements using shared memory.
* It iteratively adds elements by reducing the stride in half until reaching 0.
* Threads with `tid < stride` update their values by adding the value at `tid + stride` to their own value.
* `__syncthreads();` ensures synchronization after each iteration.

**6. Storing Partial Results:**
* After the reduction loop finishes, thread 0 of each block (`tid == 0`) stores the block's partial sum (`shared[0]`) into the `result` array at index `blockIdx.x`.

# `main()` Function:
**1. Array Initialization and Allocation:**
* Allocates memory for the array `arr`.
* Initializes the array with random values and prints its elements.

**2. Device Memory Allocation and Copying:**
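* Allocates memory on the GPU for the array data (`d_arr`) and for `d_result`. Note that `d_result` is never read or written afterwards: the per-block partial sums are stored in `temp_result` (step 4) and the final sum is formed on the CPU.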
* Copies the array data from the host to the device.

**3. Kernel Configuration:**
* Calculates the grid and block sizes based on the array size (`n`) and the defined `BLOCK_SIZE`.

**4. Temporary Memory Allocation:**
* Allocates temporary memory (`temp_result`) on the GPU to store partial results.

**5. Kernel Launch:**
* Launches the kernel (`sum_reduce_kernel`) with the configured grid and block sizes to perform parallel reduction on the GPU.

**6. Copying Partial Results to CPU:**
* Copies the partial results from the device to the host.

**7. Final Reduction on CPU:**
* Performs the final reduction on the CPU by summing up the partial results obtained from each block.

**8. Printing the Sum:**
* Displays the sum of the array elements computed on the CPU.

**9. Memory Cleanup:**
* Frees the allocated memory on both the host and the device.


In summary, this code utilizes CUDA to perform parallel reduction, efficiently summing the elements of an array. It uses GPU kernels to distribute workload across threads and blocks, optimizing computation via parallelism. The main() function initializes the array, transfers data to the GPU, launches the reduction kernel, retrieves and calculates the final sum on the CPU, showcasing CUDA's capacity for accelerating array summation tasks.







In [None]:
%%writefile sum.cu

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256

/*
 * sum_reduce_kernel: each block adds up its slice of arr and writes one
 * partial sum to result[blockIdx.x].
 *
 * Parameters:
 *   arr     device array of n ints
 *   n       number of valid elements in arr
 *   result  device array with at least gridDim.x ints; element b receives the
 *           sum of the elements covered by block b
 *
 * Launch layout: 1-D grid of 1-D blocks, one element per thread.
 * Shared memory: blockDim.x * sizeof(int) bytes of dynamic shared memory must
 * be passed as the third launch parameter.
 * The block size may be any positive value; it does not have to be a power
 * of two.
 */
__global__ void sum_reduce_kernel(int* arr, int n, int* result) {
    extern __shared__ int shared[];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int block_threads = blockDim.x;

    // Threads past the end of the array contribute the neutral element.
    if (idx < n) {
        shared[tid] = arr[idx];
    } else {
        shared[tid] = 0;
    }
    __syncthreads();

    // Start from the largest power of two strictly below the block size.
    // For a power-of-two block this is blockDim.x / 2; for any other size it
    // makes sure the trailing elements are folded in instead of being skipped.
    int first_stride = 1;
    while (first_stride * 2 < block_threads) {
        first_stride *= 2;
    }

    for (int stride = first_stride; stride > 0; stride >>= 1) {
        // The second condition only matters in the first pass of a
        // non-power-of-two block, where some partners do not exist.
        if (tid < stride && tid + stride < block_threads) {
            shared[tid] += shared[tid + stride];
        }
        // Reached by every thread of the block, outside the branch.
        __syncthreads();
    }

    if (tid == 0) {
        result[blockIdx.x] = shared[0];
    }
}

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the step that was being attempted.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills a small array with pseudo-random values, reduces it to one partial
// sum per block on the GPU with sum_reduce_kernel, adds the partial sums on
// the CPU and prints the total.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any
// CUDA call fails.
int main() {
    int n = 10; // array size
    int *arr, *d_arr, *d_result;
    int result_cpu = 0;

    // Allocate memory for the array
    arr = (int*)malloc(n * sizeof(int));
    if (arr == NULL) {
        fprintf(stderr, "Host allocation of %d ints failed\n", n);
        return EXIT_FAILURE;
    }

    // Initialize array with random values
    printf("Inital Array\n");
    for (int i = 0; i < n; ++i) {
        arr[i] = rand() % 1000;
        printf("elem%d = %d, ",i,arr[i]);
    }
    printf("\n");

    // Allocate memory on device
    check_cuda(cudaMalloc((void**)&d_arr, n * sizeof(int)), "cudaMalloc d_arr");
    // NOTE(review): d_result is allocated and freed but never passed to the
    // kernel or read back; the partial sums live in temp_result below.
    check_cuda(cudaMalloc((void**)&d_result, sizeof(int)), "cudaMalloc d_result");

    // Copy array to device
    check_cuda(cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice),
               "copy array to device");

    // Calculate grid and block sizes: enough blocks to give every element a thread
    int block_size = BLOCK_SIZE;
    int grid_size = (n + block_size - 1) / block_size;

    // Allocate temporary storage for partial sums (one int per block)
    int* temp_result;
    check_cuda(cudaMalloc((void**)&temp_result, grid_size * sizeof(int)),
               "cudaMalloc temp_result");

    // Launch kernel for sum reduction; the third launch parameter sizes the
    // kernel's dynamic shared array at one int per thread.
    sum_reduce_kernel<<<grid_size, block_size, block_size * sizeof(int)>>>(d_arr, n, temp_result);
    // A launch reports no status itself: configuration errors show up in
    // cudaGetLastError(), faults during execution at the next synchronisation.
    check_cuda(cudaGetLastError(), "sum_reduce_kernel launch");
    check_cuda(cudaDeviceSynchronize(), "sum_reduce_kernel execution");

    // Reduce the partial sums on the CPU
    int* partial_result = (int*)malloc(grid_size * sizeof(int));
    if (partial_result == NULL) {
        fprintf(stderr, "Host allocation of %d partial sums failed\n", grid_size);
        return EXIT_FAILURE;
    }
    check_cuda(cudaMemcpy(partial_result, temp_result, grid_size * sizeof(int),
                          cudaMemcpyDeviceToHost),
               "copy partial sums to host");

    for (int i = 0; i < grid_size; ++i) {
        result_cpu += partial_result[i];
    }

    printf("Sum of array elements : %d\n", result_cpu);

    // Free memory
    check_cuda(cudaFree(d_arr), "cudaFree d_arr");
    check_cuda(cudaFree(d_result), "cudaFree d_result");
    check_cuda(cudaFree(temp_result), "cudaFree temp_result");
    free(arr);
    free(partial_result);

    return 0;
}


Overwriting sum.cu


In [None]:
!nvcc sum.cu -o sum

In [None]:
!./sum

Inital Array
elem0 = 383, elem1 = 886, elem2 = 777, elem3 = 915, elem4 = 793, elem5 = 335, elem6 = 386, elem7 = 492, elem8 = 649, elem9 = 421, 
Sum of array elements : 6037


# This code performs matrix multiplication of two 4x4 matrices (A and B) using CUDA.

**1. Matrix Initialization:**
* Two 4x4 matrices, A and B, are initialized in the main() function.

**2. Memory Allocation and Copying to Device:**
* Memory is allocated on the GPU for matrices d_A, d_B, and d_C using cudaMalloc.
* The contents of matrices A and B are copied from the host (CPU) to the device (GPU) using cudaMemcpy.

**3. Kernel Definition (matrix_multiply):**
* The matrix_multiply kernel is launched on the GPU to perform the matrix multiplication.
* Each thread calculates a single element of the resulting matrix C.
* blockIdx and threadIdx are used to calculate the row and column indices for the element that each thread will compute.
* A loop iterates through the elements of A and B to compute the product and accumulate the sum for the resulting matrix C.

**4. Kernel Launch Configuration:**
* dim3 structures, threadsPerBlock and blocksPerGrid, are defined to configure the number of threads per block and the number of blocks per grid for kernel execution.

**5. Memory Copy Back to Host:**

* The resulting matrix C is copied from the device back to the host (CPU) using cudaMemcpy.

**6. Printing Matrices:**

* The matrices A, B, and the resulting matrix C are printed to the console to display their contents.

**7. Memory Cleanup:**

* Memory allocated on the GPU is freed using cudaFree.

**8. Return:**

* The main() function returns 0, indicating successful execution.

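In summary, this CUDA C code initializes two 4x4 matrices, performs matrix multiplication on the GPU using a kernel function, retrieves the result back to the CPU, and displays the matrices to the console. The matrix values can be changed freely. The size can be changed through `SIZE`, but because the kernel is launched as a single block of `SIZE` x `SIZE` threads, `SIZE` must stay small enough for one block (CUDA allows at most 1024 threads per block, i.e. `SIZE` <= 32) unless the launch configuration is changed as well.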

In [None]:
%%writefile matrixmul.cu

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define SIZE 4

/*
 * matrix_multiply: computes C = A * B for SIZE x SIZE int matrices stored
 * row-major in flat device arrays.
 *
 * Parameters:
 *   A, B  device arrays of SIZE * SIZE ints (inputs)
 *   C     device array of SIZE * SIZE ints (output)
 *
 * Launch layout: 2-D grid of 2-D blocks; the thread at (x, y) produces the
 * element in row y, column x. The launch may cover more than SIZE x SIZE
 * threads; the surplus threads do nothing. No shared memory is used.
 */
__global__ void matrix_multiply(int* A, int* B, int* C) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix must not touch memory; without this guard
    // any launch shape larger than SIZE x SIZE reads and writes out of bounds.
    if (row >= SIZE || col >= SIZE) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    int sum = 0;
    for (int k = 0; k < SIZE; ++k) {
        sum += A[row * SIZE + k] * B[k * SIZE + col];
    }

    C[row * SIZE + col] = sum;
}

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the step that was being attempted.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Prints `heading` verbatim, then the matrix one row per line with a space
// after every element.
static void print_matrix(const char* heading, const int m[SIZE][SIZE]) {
    printf("%s", heading);
    for (int i = 0; i < SIZE; ++i) {
        for (int j = 0; j < SIZE; ++j) {
            printf("%d ", m[i][j]);
        }
        printf("\n");
    }
}

// Multiplies two fixed SIZE x SIZE matrices on the GPU with matrix_multiply
// and prints both inputs and the product.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main() {
    int A[SIZE][SIZE] = {{1, 2, 3, 4},
                         {5, 6, 7, 8},
                         {9, 10, 11, 12},
                         {13, 14, 15, 16}};

    int B[SIZE][SIZE] = {{1, 0, 0, 1},
                         {0, 1, 1, 0},
                         {0, 1, 1, 0},
                         {1, 0, 0, 1}};

    int C[SIZE][SIZE];

    int* d_A, *d_B, *d_C;
    const size_t matrix_bytes = SIZE * SIZE * sizeof(int);

    check_cuda(cudaMalloc((void**)&d_A, matrix_bytes), "cudaMalloc d_A");
    check_cuda(cudaMalloc((void**)&d_B, matrix_bytes), "cudaMalloc d_B");
    check_cuda(cudaMalloc((void**)&d_C, matrix_bytes), "cudaMalloc d_C");

    check_cuda(cudaMemcpy(d_A, A, matrix_bytes, cudaMemcpyHostToDevice), "copy A to device");
    check_cuda(cudaMemcpy(d_B, B, matrix_bytes, cudaMemcpyHostToDevice), "copy B to device");

    // One block with exactly one thread per output element.
    dim3 threadsPerBlock(SIZE, SIZE);
    dim3 blocksPerGrid(1, 1);

    matrix_multiply<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    // A launch reports no status itself: an invalid configuration (for
    // example a block that is too large) shows up in cudaGetLastError(),
    // faults during execution at the next synchronisation.
    check_cuda(cudaGetLastError(), "matrix_multiply launch");
    check_cuda(cudaDeviceSynchronize(), "matrix_multiply execution");

    check_cuda(cudaMemcpy(C, d_C, matrix_bytes, cudaMemcpyDeviceToHost), "copy C to host");

    print_matrix("Matrix A:\n", A);
    print_matrix("\nMatrix B:\n", B);
    print_matrix("\nResultant Matrix C:\n", C);

    check_cuda(cudaFree(d_A), "cudaFree d_A");
    check_cuda(cudaFree(d_B), "cudaFree d_B");
    check_cuda(cudaFree(d_C), "cudaFree d_C");

    return 0;
}


Overwriting matrixmul.cu


In [None]:
!nvcc matrixmul.cu -o matrixmul

In [None]:
!./matrixmul

Matrix A:
1 2 3 4 
5 6 7 8 
9 10 11 12 
13 14 15 16 

Matrix B:
1 0 0 1 
0 1 1 0 
0 1 1 0 
1 0 0 1 

Resultant Matrix C:
5 5 5 5 
13 13 13 13 
21 21 21 21 
29 29 29 29 
