In [None]:
%%writefile hello.cu

#include <stdio.h>



__global__ void helloFromGPU() {

    int threadId = threadIdx.x;

    int blockId  = blockIdx.x;

    int globalId = blockIdx.x * blockDim.x + threadIdx.x;



    printf("Hello from block %d, thread %d (global %d)\n", blockId, threadId, globalId);

}



int main() {

    helloFromGPU<<<2, 4>>>();

    cudaDeviceSynchronize();

    return 0;

}



Writing hello.cu


In [None]:
%%writefile vector_add.cu

#include <stdio.h>

#include <cuda.h>

#include <chrono>

#include <iostream>



using namespace std;



__global__ void vectorAdd(float *A, float *B, float *C, int n) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n) {

        C[i] = A[i] + B[i];

    }

}



int main() {

    int n = 10000000;  // 10 million

    size_t size = n * sizeof(float);





    float *h_A = new float[n];

    float *h_B = new float[n];

    float *h_C = new float[n];



    for (int i = 0; i < n; i++) {

        h_A[i] = 1.0f;

        h_B[i] = 2.0f;

    }


    auto start_cpu = chrono::high_resolution_clock::now();

    for (int i = 0; i < n; i++) {

        h_C[i] = h_A[i] + h_B[i];

    }

    auto end_cpu = chrono::high_resolution_clock::now();

    double cpu_time = chrono::duration<double>(end_cpu - start_cpu).count();



    float *d_A, *d_B, *d_C;

    cudaMalloc((void**)&d_A, size);

    cudaMalloc((void**)&d_B, size);

    cudaMalloc((void**)&d_C, size);



    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);



    int threadsPerBlock = 256;

    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;



    auto start_gpu = chrono::high_resolution_clock::now();

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, n);

    cudaDeviceSynchronize();

    auto end_gpu = chrono::high_resolution_clock::now();

    double gpu_time = chrono::duration<double>(end_gpu - start_gpu).count();



    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);



    printf("CPU time = %f sec\n", cpu_time);

    printf("GPU time = %f sec\n", gpu_time);

    printf("Speedup = %f\n", cpu_time / gpu_time);



    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);

    delete[] h_A; delete[] h_B; delete[] h_C;



    return 0;

}


Writing vector_add.cu


In [None]:
%%writefile image_invert.cu
#include <opencv2/opencv.hpp>
#include <iostream>
#include <cuda_runtime.h>

using namespace cv;
using namespace std;

__global__ void invertImage(unsigned char* input, unsigned char* output, int width, int height, int channels) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        int idx = (y * width + x) * channels;
        for (int c = 0; c < channels; c++) {
            output[idx + c] = 255 - input[idx + c];
        }
    }
}

int main() {
    Mat input = imread("sample.jpg");
    if (input.empty()) {
        cout << "Error: Could not open image!" << endl;
        return -1;
    }

    int width = input.cols;
    int height = input.rows;
    int channels = input.channels();
    int img_size = width * height * channels;

    unsigned char* h_input = input.data;
    unsigned char* h_output = new unsigned char[img_size];

    // CPU inversion
    for (int i = 0; i < img_size; i++) {
        h_output[i] = 255 - h_input[i];
    }
    Mat cpu_img(height, width, CV_8UC3, h_output);
    imwrite("output_cpu.jpg", cpu_img.clone());

    // GPU inversion
    unsigned char *d_input, *d_output;
    cudaMalloc((void**)&d_input, img_size);
    cudaMalloc((void**)&d_output, img_size);
    cudaMemcpy(d_input, h_input, img_size, cudaMemcpyHostToDevice);

    dim3 block(16, 16);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    invertImage<<<grid, block>>>(d_input, d_output, width, height, channels);
    cudaDeviceSynchronize();

    cudaMemcpy(h_output, d_output, img_size, cudaMemcpyDeviceToHost);
    Mat gpu_img(height, width, CV_8UC3, h_output);
    imwrite("output_gpu.jpg", gpu_img.clone());

    cudaFree(d_input);
    cudaFree(d_output);
    delete[] h_output;

    cout << "Done: output_cpu.jpg and output_gpu.jpg generated" << endl;
    return 0;
}


Writing image_invert.cu


In [5]:
import cv2
import matplotlib.pyplot as plt

# Load images
original_img = cv2.imread("sample.jpg")
cpu_img = cv2.imread("output_cpu.jpg")
gpu_img = cv2.imread("output_gpu.jpg")

# Convert BGR to RGB for proper color display
original_rgb = cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB)
cpu_rgb = cv2.cvtColor(cpu_img, cv2.COLOR_BGR2RGB)
gpu_rgb = cv2.cvtColor(gpu_img, cv2.COLOR_BGR2RGB)

# Display side-by-side
plt.figure(figsize=(18,6))

plt.subplot(1, 3, 1)
plt.imshow(original_rgb)
plt.title("Original Image")
plt.axis("off")

plt.subplot(1, 3, 2)
plt.imshow(cpu_rgb)
plt.title("CPU Inverted Image")
plt.axis("off")

plt.subplot(1, 3, 3)
plt.imshow(gpu_rgb)
plt.title("GPU Inverted Image")
plt.axis("off")

plt.show()


error: OpenCV(4.12.0) /io/opencv/modules/imgproc/src/color.cpp:199: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'
