In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-b1c7hdyz
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-b1c7hdyz
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done


In [3]:
%load_ext nvcc_plugin

directory /content/src already exists
Out bin /content/result.out


In [4]:
%%writefile operation.cu

#include <algorithm> // For min and max operations
#include <climits>   // For INT_MAX and INT_MIN
#include <cstdlib>   // For std::exit and EXIT_FAILURE
#include <iostream>
#include <stdexcept> // For std::invalid_argument and std::runtime_error
#include <string>
#include <vector>

#include <cuda_runtime.h>

using namespace std;

// CUDA kernel: per-block minimum reduction.
//
// Every thread loads one element of d_arr into shared memory (threads whose
// global index is >= size load INT_MAX, the identity for min). The block then
// reduces its shared-memory slots to a single minimum, and thread 0 writes it
// to d_result[blockIdx.x]. The caller combines the per-block results.
//
// Launch requirements:
//   - 1D grid of 1D blocks. Elements at indices >= gridDim.x * blockDim.x are
//     not visited, so the launch must cover `size`.
//   - Dynamic shared memory of at least blockDim.x * sizeof(int) bytes.
//   - d_result must have room for gridDim.x ints.
//   - blockDim.x does not have to be a power of two.
__global__ void min_reduction(int* d_arr, int* d_result, int size) {
    extern __shared__ int sdata[];
    const int tid = threadIdx.x;
    // Widened so blockIdx.x * blockDim.x cannot wrap for very large inputs.
    const long long global_id =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Load data into shared memory, padding the tail with the identity value.
    sdata[tid] = (global_id < size) ? d_arr[global_id] : INT_MAX;
    __syncthreads();

    // Tree reduction over the first `active` slots. Each pass folds the upper
    // part of the range onto the lower part. The half is rounded up so that
    // the middle slot of an odd-sized range survives to the next pass; a plain
    // blockDim.x / 2 halving would silently drop it. Because padding slots
    // hold INT_MAX, no extra bounds check against `size` is needed here.
    for (int active = blockDim.x; active > 1; active = (active + 1) / 2) {
        const int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] = min(sdata[tid], sdata[tid + half]);
        }
        // Reached by every thread of the block: the loop bounds are uniform.
        __syncthreads();
    }

    // Store the result from the first thread in each block
    if (tid == 0) {
        d_result[blockIdx.x] = sdata[0];
    }
}

// CUDA kernel: per-block maximum reduction.
//
// Every thread loads one element of d_arr into shared memory (threads whose
// global index is >= size load INT_MIN, the identity for max). The block then
// reduces its shared-memory slots to a single maximum, and thread 0 writes it
// to d_result[blockIdx.x]. The caller combines the per-block results.
//
// Launch requirements:
//   - 1D grid of 1D blocks. Elements at indices >= gridDim.x * blockDim.x are
//     not visited, so the launch must cover `size`.
//   - Dynamic shared memory of at least blockDim.x * sizeof(int) bytes.
//   - d_result must have room for gridDim.x ints.
//   - blockDim.x does not have to be a power of two.
__global__ void max_reduction(int* d_arr, int* d_result, int size) {
    extern __shared__ int sdata[];
    const int tid = threadIdx.x;
    // Widened so blockIdx.x * blockDim.x cannot wrap for very large inputs.
    const long long global_id =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Load data into shared memory, padding the tail with the identity value.
    sdata[tid] = (global_id < size) ? d_arr[global_id] : INT_MIN;
    __syncthreads();

    // Tree reduction over the first `active` slots. Each pass folds the upper
    // part of the range onto the lower part. The half is rounded up so that
    // the middle slot of an odd-sized range survives to the next pass; a plain
    // blockDim.x / 2 halving would silently drop it. Because padding slots
    // hold INT_MIN, no extra bounds check against `size` is needed here.
    for (int active = blockDim.x; active > 1; active = (active + 1) / 2) {
        const int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] = max(sdata[tid], sdata[tid + half]);
        }
        // Reached by every thread of the block: the loop bounds are uniform.
        __syncthreads();
    }

    // Store the result from the first thread in each block
    if (tid == 0) {
        d_result[blockIdx.x] = sdata[0];
    }
}

// CUDA kernel: per-block sum reduction.
//
// Every thread loads one element of d_arr into shared memory (threads whose
// global index is >= size load 0, the identity for addition). The block then
// reduces its shared-memory slots to a single sum, and thread 0 writes it to
// d_result[blockIdx.x]. The caller combines the per-block results.
//
// The partial sums are accumulated in int, so large inputs can overflow.
//
// Launch requirements:
//   - 1D grid of 1D blocks. Elements at indices >= gridDim.x * blockDim.x are
//     not visited, so the launch must cover `size`.
//   - Dynamic shared memory of at least blockDim.x * sizeof(int) bytes.
//   - d_result must have room for gridDim.x ints.
//   - blockDim.x does not have to be a power of two.
__global__ void sum_reduction(int* d_arr, int* d_result, int size) {
    extern __shared__ int sdata[];
    const int tid = threadIdx.x;
    // Widened so blockIdx.x * blockDim.x cannot wrap for very large inputs.
    const long long global_id =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Load data into shared memory, padding the tail with the identity value.
    sdata[tid] = (global_id < size) ? d_arr[global_id] : 0;
    __syncthreads();

    // Tree reduction over the first `active` slots. Each pass folds the upper
    // part of the range onto the lower part. The half is rounded up so that
    // the middle slot of an odd-sized range survives to the next pass; a plain
    // blockDim.x / 2 halving would silently drop it. Because padding slots
    // hold 0, no extra bounds check against `size` is needed here.
    for (int active = blockDim.x; active > 1; active = (active + 1) / 2) {
        const int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] += sdata[tid + half];
        }
        // Reached by every thread of the block: the loop bounds are uniform.
        __syncthreads();
    }

    // Store the result from the first thread in each block
    if (tid == 0) {
        d_result[blockIdx.x] = sdata[0];
    }
}

// Finishes a reduction on the host.
//
// Copies `size` partial results from device memory at d_result to the host
// and folds them left to right with `op`, starting from the first element.
//
// Parameters:
//   d_result - device pointer to at least `size` ints.
//   size     - number of partial results to combine; must be positive.
//   op       - binary combiner applied as op(accumulated, next).
//
// Returns the combined value.
// Throws std::invalid_argument if size <= 0 or op is null, and
// std::runtime_error if the device-to-host copy fails.
int final_reduction(int* d_result, int size, int (*op)(int, int)) {
    // Without at least one element there is no initial value to fold from.
    if (size <= 0) {
        throw std::invalid_argument("final_reduction: size must be positive");
    }
    if (op == nullptr) {
        throw std::invalid_argument("final_reduction: op must not be null");
    }

    std::vector<int> h_result(size);
    const cudaError_t status =
        cudaMemcpy(h_result.data(), d_result,
                   static_cast<size_t>(size) * sizeof(int),
                   cudaMemcpyDeviceToHost);
    if (status != cudaSuccess) {
        // A failure here may also come from an earlier asynchronous kernel.
        throw std::runtime_error(std::string("final_reduction: cudaMemcpy failed: ") +
                                 cudaGetErrorString(status));
    }

    // Perform final reduction on the host
    int final_result = h_result[0];
    for (int i = 1; i < size; i++) {
        final_result = op(final_result, h_result[i]);
    }

    return final_result;
}

// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Signature shared by the min/max/sum block-reduction kernels.
typedef void (*reduction_kernel_t)(int*, int*, int);

// Runs one block-reduction kernel over d_arr (n elements) and combines the
// per-block partial results on the host with `op`.
static int run_reduction(reduction_kernel_t kernel, int* d_arr, int n,
                         int gridSize, int blockSize, int (*op)(int, int)) {
    int* d_partial = nullptr;
    check_cuda(cudaMalloc(&d_partial, static_cast<size_t>(gridSize) * sizeof(int)),
               "cudaMalloc for partial results");

    // One int of dynamic shared memory per thread, as the kernels require.
    kernel<<<gridSize, blockSize, blockSize * sizeof(int)>>>(d_arr, d_partial, n);
    check_cuda(cudaGetLastError(), "kernel launch");            // bad launch configuration
    check_cuda(cudaDeviceSynchronize(), "kernel execution");    // faults raised while running

    const int result = final_reduction(d_partial, gridSize, op);
    check_cuda(cudaFree(d_partial), "cudaFree for partial results");
    return result;
}

// Reads an integer array from standard input, computes its minimum, maximum,
// sum and average with GPU block reductions, and prints the results.
// Returns 0 on success and 1 when the input is invalid; CUDA failures
// terminate the program with a diagnostic on stderr.
int main() {
    int n = 0;
    std::cout << "Enter the number of elements: ";
    // An empty or negative count would produce a zero-block launch and a
    // division by zero in the average, so reject it up front.
    if (!(std::cin >> n) || n <= 0) {
        std::cerr << "The number of elements must be a positive integer." << std::endl;
        return 1;
    }

    // Host copy of the input; the vector releases its storage on every exit path.
    std::vector<int> arr(n);

    // Input the array elements from the user
    std::cout << "Enter " << n << " elements:\n";
    for (int i = 0; i < n; ++i) {
        if (!(std::cin >> arr[i])) {
            std::cerr << "Invalid input for element " << i << "." << std::endl;
            return 1;
        }
    }

    // Allocate memory on the GPU and upload the input
    const size_t bytes = static_cast<size_t>(n) * sizeof(int);
    int* d_arr = nullptr;
    check_cuda(cudaMalloc(&d_arr, bytes), "cudaMalloc for input array");
    check_cuda(cudaMemcpy(d_arr, arr.data(), bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy host to device");

    const int blockSize = 256;
    // Ceiling division written so it cannot overflow for n close to INT_MAX.
    const int gridSize = (n - 1) / blockSize + 1;

    // Minimum value
    const int minval = run_reduction(min_reduction, d_arr, n, gridSize, blockSize,
                                     [](int a, int b) { return std::min(a, b); });

    // Maximum value
    const int maxval = run_reduction(max_reduction, d_arr, n, gridSize, blockSize,
                                     [](int a, int b) { return std::max(a, b); });

    // Summation (accumulated in int, so very large inputs can overflow)
    const int sumval = run_reduction(sum_reduction, d_arr, n, gridSize, blockSize,
                                     [](int a, int b) { return a + b; });

    // Calculate average on the host
    const double average = static_cast<double>(sumval) / n;

    std::cout << "The minimum value is: " << minval << std::endl;
    std::cout << "The maximum value is: " << maxval << std::endl;
    std::cout << "The summation is: " << sumval << std::endl;
    std::cout << "The average is: " << average << std::endl;

    // Free device memory
    check_cuda(cudaFree(d_arr), "cudaFree for input array");

    return 0;
}


Overwriting operation.cu


In [5]:
!nvcc operation.cu -o opt

In [6]:
!./opt

Enter the number of elements: 5
Enter 5 elements:
1
2
3
4
5
The minimum value is: 1
The maximum value is: 5
The summation is: 15
The average is: 3
