<a href="https://colab.research.google.com/github/RanjanRaj07/CUDAProgramming/blob/main/cudacode1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#check for cuda compiler.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [1]:
#install nvcc plugin for Jupyter notebook
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-du4cecyb
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-du4cecyb
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=98df95280b0229a6f340d47ad59e486471064197ced4e8c3483f2f13411ba9e3
  Stored in directory: /tmp/pip-ephem-wheel-cache-l9q_uk4l/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
#load the installed nvcc plugin for notebook
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [3]:
# a test program
%%cu
#include <iostream>
	// Smoke test for the %%cu toolchain: prints a fixed greeting and exits.
	int main()
{
	// Text written to standard output; the trailing newline ends the line.
	const char* const greeting = "welcome to cude code\n";
	std::cout << greeting;

	// Zero exit status reports success to the caller.
	return 0;
}

welcome to cude code



In [4]:
# GPU CUDA program for adding 3000000 items
%%cu
#include <iostream>
#include <cmath>
#include <chrono>

// CUDA kernel to perform element-wise vector addition: C[i] = A[i] + B[i]
// for every i in [0, size).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its flat
// global index and advances by the total number of launched threads, so the
// result is complete for any grid/block size, including launches with fewer
// threads than elements.
//
// Parameters:
//   A, B  - input arrays, read only; each must hold at least `size` floats
//   C     - output array; must hold at least `size` floats
//   size  - number of elements to process; values <= 0 do nothing
__global__ void vectorAdd(const float* A, const float* B, float* C, int size) {
    // 64-bit arithmetic so the flat index and stride cannot wrap for large launches.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < size;
         idx += stride) {
        C[idx] = A[idx] + B[idx];
    }
}

// Reports a failed CUDA runtime call on stderr.
// `status` is the value returned by the call and `what` names the step.
// Returns true when the call succeeded, so checks can be chained with &&.
static bool cudaStepOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two 3,000,000-element float vectors on the GPU, prints the first ten
// sums for verification and the wall-clock time of kernel launch plus
// synchronization. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int size = 3000000;
    // Keep the byte count in size_t so the product is not narrowed to int.
    const size_t numBytes = static_cast<size_t>(size) * sizeof(float);

    // Allocate memory on host
    float* h_A = new float[size];
    float* h_B = new float[size];
    float* h_C = new float[size];

    // Initialize input arrays
    for (int i = 0; i < size; ++i) {
        h_A[i] = static_cast<float>(i);
        h_B[i] = static_cast<float>(2 * i);
    }

    // Device pointers start as null so the cleanup below is safe even when an
    // earlier step failed (cudaFree on a null pointer is a no-op).
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    // Allocate device memory and transfer the inputs. The && chain stops at
    // the first failing step, so later steps never run on invalid buffers.
    bool ok = cudaStepOk(cudaMalloc(&d_A, numBytes), "cudaMalloc(d_A)")
           && cudaStepOk(cudaMalloc(&d_B, numBytes), "cudaMalloc(d_B)")
           && cudaStepOk(cudaMalloc(&d_C, numBytes), "cudaMalloc(d_C)")
           && cudaStepOk(cudaMemcpy(d_A, h_A, numBytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(h_A -> d_A)")
           && cudaStepOk(cudaMemcpy(d_B, h_B, numBytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(h_B -> d_B)");

    std::chrono::duration<double> elapsed(0.0);
    if (ok) {
        // Define grid and block dimensions (ceil-div so every element is covered)
        const int blockSize = 256;
        const int gridSize = (size + blockSize - 1) / blockSize;

        // Measure execution time using chrono
        auto start = std::chrono::high_resolution_clock::now();

        // Launch the kernel
        vectorAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, size);

        // A launch returns no status itself: cudaGetLastError reports an
        // invalid launch, and the synchronize call both waits for the kernel
        // and reports faults raised while it executed.
        const cudaError_t launchStatus = cudaGetLastError();
        const cudaError_t syncStatus = cudaDeviceSynchronize();

        auto end = std::chrono::high_resolution_clock::now();
        elapsed = end - start;

        ok = cudaStepOk(launchStatus, "vectorAdd launch")
          && cudaStepOk(syncStatus, "vectorAdd execution");
    }

    // Transfer results from device to host
    if (ok) {
        ok = cudaStepOk(cudaMemcpy(h_C, d_C, numBytes, cudaMemcpyDeviceToHost),
                        "cudaMemcpy(d_C -> h_C)");
    }

    if (ok) {
        // Print a few results for verification
        for (int i = 0; i < 10 && i < size; ++i) {
            std::cout << h_A[i] << " + " << h_B[i] << " = " << h_C[i] << std::endl;
        }

        std::cout << "Time taken for execution: " << elapsed.count() << " seconds" << std::endl;
    }

    // Clean up host memory
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok ? 0 : 1;
}


0 + 0 = 0
1 + 2 = 3
2 + 4 = 6
3 + 6 = 9
4 + 8 = 12
5 + 10 = 15
6 + 12 = 18
7 + 14 = 21
8 + 16 = 24
9 + 18 = 27
Time taken for execution: 0.000185186 seconds



***3,000,000*** items processed in less than a millisecond

---
***POWER OF GPU***
