In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [4]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-ics0y6do
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-ics0y6do
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4294 sha256=e453609e11f3da433a4f3601ddae29df66dc7fa0a2770f5110b0e50a39860525
  Stored in directory: /tmp/pip-ephem-wheel-cache-sviu_tkf/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [5]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [8]:
!apt-get -qq install -y libopencv-dev


In [9]:
!pkg-config --cflags opencv4


-I/usr/include/opencv4


In [None]:
!nvcc -o main your_cuda_code.cu -I/usr/include/opencv4


In [15]:
# Install OpenCV
# (development package; same apt-get command as the earlier install cell)
!apt-get -qq install -y libopencv-dev

# Create a simple CUDA program (main.cu)
# NOTE: cuda_code is a regular (non-raw) triple-quoted string, so Python
# interprets backslash escapes in it before the text is written to main.cu.
# The embedded C++ therefore must not rely on backslash sequences (for example
# a backslash-n inside a C++ string literal); it uses std::endl instead.
cuda_code = """
#include <cmath>
#include <ctime>
#include <functional>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
// Fixed working resolution: every buffer in this program holds
// imageWidth * imageHeight elements laid out row by row.
constexpr int imageWidth = 1024;
constexpr int imageHeight = 1024;

// Threads launched per CUDA block.
constexpr int threadsPerBlock = 256;

// Extent of the rectangular voting window (the traffic signal template).
constexpr int templateWidth = 50;
constexpr int templateHeight = 50;

// Forward declarations of the CPU reference and the GPU kernel.
void serialHoughTransform(const float* inputImage, int* accumulator);
__global__ void parallelHoughTransform(const float* inputImage, int* accumulator);

// Function to perform the Generalized Hough Transform in serial
// Serial (CPU) version of the voting step.
//
// inputImage:  imageWidth * imageHeight floats, row-major.
// accumulator: imageWidth * imageHeight ints, row-major. Votes are added on
//              top of whatever the buffer already holds; this function does
//              not clear it.
//
// For every input pixel whose value is greater than zero, one vote is added to
// each accumulator cell located 0..templateHeight-1 rows above and
// 0..templateWidth-1 columns to the left of that pixel. Cells that would fall
// outside the image are skipped.
void serialHoughTransform(const float* inputImage, int* accumulator) {
    for (int row = 0; row < imageHeight; ++row) {
        const float* sourceRow = inputImage + row * imageWidth;

        for (int col = 0; col < imageWidth; ++col) {
            // Only pixels with a positive value cast votes.
            if (!(sourceRow[col] > 0)) {
                continue;
            }

            for (int offsetY = 0; offsetY < templateHeight; ++offsetY) {
                const int voteRow = row - offsetY;

                for (int offsetX = 0; offsetX < templateWidth; ++offsetX) {
                    const int voteCol = col - offsetX;

                    // Drop votes that land outside the accumulator.
                    const bool outside = voteRow < 0 || voteRow >= imageHeight ||
                                         voteCol < 0 || voteCol >= imageWidth;
                    if (outside) {
                        continue;
                    }

                    accumulator[voteRow * imageWidth + voteCol] += 1;
                }
            }
        }
    }
}

// CUDA kernel to perform the Generalized Hough Transform in parallel
// GPU version of the voting step: one thread per input pixel.
//
// Expected launch: a 1D grid of 1D blocks whose total thread count covers
// imageWidth * imageHeight; threads whose row falls past the image exit early.
// No shared memory is used.
//
// inputImage:  device pointer, imageWidth * imageHeight floats, row-major.
// accumulator: device pointer, imageWidth * imageHeight ints, row-major.
//              Votes are added atomically on top of the existing contents;
//              the kernel does not clear the buffer.
__global__ void parallelHoughTransform(const float* inputImage, int* accumulator) {
    const int pixel = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = pixel / imageWidth;
    const int col = pixel % imageWidth;

    // Threads mapped past the end of the image have nothing to do.
    if (row >= imageHeight || col >= imageWidth) {
        return;
    }

    // Only pixels with a positive value cast votes.
    if (!(inputImage[row * imageWidth + col] > 0)) {
        return;
    }

    for (int offsetY = 0; offsetY < templateHeight; ++offsetY) {
        const int voteRow = row - offsetY;

        for (int offsetX = 0; offsetX < templateWidth; ++offsetX) {
            const int voteCol = col - offsetX;

            const bool inside = voteRow >= 0 && voteRow < imageHeight &&
                                voteCol >= 0 && voteCol < imageWidth;
            if (inside) {
                // Several threads can vote for the same cell, hence the atomic.
                atomicAdd(&accumulator[voteRow * imageWidth + voteCol], 1);
            }
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Loads sample.jpg as grayscale, runs the voting step on the CPU and on the
// GPU, and prints the time each one took.
// Returns 0 on success, -1 if the image cannot be loaded or a CUDA call fails.
int main() {
    // Load the input image
    cv::Mat inputImageMat = cv::imread("sample.jpg", cv::IMREAD_GRAYSCALE);

    // Check if the image is loaded successfully
    if (inputImageMat.empty()) {
        std::cerr << "Error loading the input image." << std::endl;
        return -1;
    }

    const int pixelCount = imageWidth * imageHeight;

    // The working buffers are always imageWidth x imageHeight. Copy only the
    // region that actually exists in the loaded image (top-left aligned) and
    // leave the rest at zero, instead of reading past the end of a smaller Mat.
    const int copyRows = inputImageMat.rows < imageHeight ? inputImageMat.rows : imageHeight;
    const int copyCols = inputImageMat.cols < imageWidth ? inputImageMat.cols : imageWidth;
    if (inputImageMat.rows != imageHeight || inputImageMat.cols != imageWidth) {
        std::cerr << "Warning: input image is " << inputImageMat.cols << "x" << inputImageMat.rows
                  << ", expected " << imageWidth << "x" << imageHeight
                  << "; it will be cropped or zero-padded." << std::endl;
    }

    // Convert the input image to a float array (single-channel, 8-bit source)
    std::vector<float> inputImage(pixelCount, 0.0f);
    for (int y = 0; y < copyRows; ++y) {
        for (int x = 0; x < copyCols; ++x) {
            inputImage[y * imageWidth + x] = static_cast<float>(inputImageMat.at<uchar>(y, x));
        }
    }

    // Both transforms add votes onto the existing accumulator contents, so the
    // accumulators must start at zero.
    std::vector<int> serialAccumulator(pixelCount, 0);
    std::vector<int> parallelAccumulator(pixelCount, 0);

    // Serial Generalized Hough Transform
    clock_t serialStart = clock();
    serialHoughTransform(inputImage.data(), serialAccumulator.data());
    clock_t serialEnd = clock();

    // Parallel Generalized Hough Transform
    float* d_inputImage = nullptr;
    int* d_parallelAccumulator = nullptr;
    const size_t imageBytes = pixelCount * sizeof(float);
    const size_t accumulatorBytes = pixelCount * sizeof(int);

    bool ok = cudaSucceeded(cudaMalloc((void**)&d_inputImage, imageBytes),
                            "cudaMalloc (input image)");
    ok = ok && cudaSucceeded(cudaMalloc((void**)&d_parallelAccumulator, accumulatorBytes),
                             "cudaMalloc (accumulator)");
    ok = ok && cudaSucceeded(cudaMemcpy(d_inputImage, inputImage.data(), imageBytes,
                                        cudaMemcpyHostToDevice),
                             "cudaMemcpy (host to device)");
    // cudaMalloc does not clear memory; zero the device accumulator before voting.
    ok = ok && cudaSucceeded(cudaMemset(d_parallelAccumulator, 0, accumulatorBytes),
                             "cudaMemset (accumulator)");

    clock_t parallelStart = clock();
    clock_t parallelEnd = parallelStart;
    if (ok) {
        // Round up so every pixel gets a thread even if the pixel count is not
        // a multiple of the block size; the kernel bounds-checks extra threads.
        const int blockCount = (pixelCount + threadsPerBlock - 1) / threadsPerBlock;
        dim3 blocks(blockCount, 1, 1);
        dim3 threads(threadsPerBlock, 1, 1);

        parallelStart = clock();
        parallelHoughTransform<<<blocks, threads>>>(d_inputImage, d_parallelAccumulator);
        // Launch-configuration errors are only visible through cudaGetLastError;
        // faults during execution surface at the synchronize call.
        ok = cudaSucceeded(cudaGetLastError(), "kernel launch");
        ok = ok && cudaSucceeded(cudaDeviceSynchronize(), "kernel execution");
        parallelEnd = clock();
    }

    ok = ok && cudaSucceeded(cudaMemcpy(parallelAccumulator.data(), d_parallelAccumulator,
                                        accumulatorBytes, cudaMemcpyDeviceToHost),
                             "cudaMemcpy (device to host)");

    // Free device memory on every path (cudaFree accepts a null pointer).
    cudaFree(d_inputImage);
    cudaFree(d_parallelAccumulator);

    if (!ok) {
        return -1;
    }

    // Calculate and print execution times
    double serialTime = double(serialEnd - serialStart) / CLOCKS_PER_SEC;
    double parallelTime = double(parallelEnd - parallelStart) / CLOCKS_PER_SEC;

    std::cout << "Serial Execution Time: " << serialTime << " seconds" << std::endl;
    std::cout << "Parallel Execution Time: " << parallelTime << " seconds" << std::endl;

    // Integer votes are exact, so the two accumulators must match element for
    // element; a difference means one of the implementations is wrong.
    if (serialAccumulator != parallelAccumulator) {
        std::cerr << "Warning: serial and parallel accumulators differ." << std::endl;
    }

    return 0;
}
"""

# Write the CUDA code to a file
with open("main.cu", "w") as file:
    file.write(cuda_code)

# Compile the CUDA code
# -I adds the OpenCV 4 header directory (the path printed by pkg-config above);
# -L/-l link against the OpenCV core, highgui and imgcodecs libraries.
!nvcc -o main main.cu -I/usr/include/opencv4 -L/usr/lib -lopencv_core -lopencv_highgui -lopencv_imgcodecs


# Run the compiled program
# The program opens "sample.jpg" by relative path, so the image must be in the
# current working directory; otherwise it prints an error and exits.
!./main










