In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [4]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-7zfj3m_r
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-7zfj3m_r
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10741 sha256=1c43b610d84440f376c57bb0b3d20f87e2433fdb6f06eb98d5fd7c81c461a971
  Stored in directory: /tmp/pip-ephem-wheel-cache-reqik072/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

In [5]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp2lf35mr4".


In [18]:
%%cuda

#include <cmath>
#include <cstdlib>
#include <iostream>

// Size of array
#define N 1048576

using namespace std;

// Kernel: element-wise sum, c[i] = a[i] + b[i], over the first N elements.
// Every thread derives one flat index from its 1D block and thread
// coordinates; threads whose index is N or larger return without touching
// memory, so the launch may contain more threads than there are elements.
__global__ void add_vectors(double *a, double *b, double *c)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard for surplus threads in the last block.
    if (idx >= N)
    {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

// Stops the program with a diagnostic when a CUDA runtime call reports an
// error. `what` names the failing step in the message.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        cerr << "CUDA error during " << what << ": "
             << cudaGetErrorString(status) << endl;
        exit(1);
    }
}

// Main program
// Adds two N-element vectors on the GPU (A filled with 1.0, B with 2.0),
// copies the result back, and checks that every element equals 3.0.
// Returns 0 on success and 1 when verification fails; any CUDA runtime
// error terminates the program through check_cuda().
int main()
{
    // Number of bytes to allocate for N doubles
    size_t bytes = N*sizeof(double);

    // Allocate memory for arrays A, B, and C on host
    double *A = new double[N];
    double *B = new double[N];
    double *C = new double[N];

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A, *d_B, *d_C;
    check_cuda(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)");
    check_cuda(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)");
    check_cuda(cudaMalloc(&d_C, bytes), "cudaMalloc(d_C)");

    // Fill host arrays A and B
    for(int i=0; i<N; i++)
    {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // Copy data from host arrays A and B to device arrays d_A and d_B
    check_cuda(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "copy A -> d_A");
    check_cuda(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "copy B -> d_B");

    // Set execution configuration parameters
    //      thr_per_blk: number of CUDA threads per grid block
    //      blk_in_grid: number of blocks in grid (integer ceiling division,
    //                   so the grid covers N even when N is not a multiple
    //                   of thr_per_blk)
    int thr_per_blk = 256;
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;

    // Launch kernel. A launch does not return a status itself, so query the
    // runtime right away to catch an invalid launch configuration.
    add_vectors<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);
    check_cuda(cudaGetLastError(), "add_vectors launch");

    // Copy data from device array d_C to host array C. Errors raised while
    // the kernel was executing are reported by this call.
    check_cuda(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "copy d_C -> C");

    // Verify results; stop at the first mismatch but still release memory
    double tolerance = 1.0e-14;
    bool verified = true;
    for(int i=0; i<N; i++)
    {
        if( fabs(C[i] - 3.0) > tolerance)
        {
            cout << "Error: value of C[" << i << "] = " << C[i] << " instead of 3.0" << endl;
            verified = false;
            break;
        }
    }

    // Free memory
    delete[] A;
    delete[] B;
    delete[] C;
    check_cuda(cudaFree(d_A), "cudaFree(d_A)");
    check_cuda(cudaFree(d_B), "cudaFree(d_B)");
    check_cuda(cudaFree(d_C), "cudaFree(d_C)");

    if(!verified)
    {
        return 1;
    }

    cout << "---------------------------" << endl;
    cout << "__SUCCESS__" << endl;
    cout << "---------------------------" << endl;
    cout << "N                 = " << N << endl;
    cout << "Threads Per Block = " << thr_per_blk << endl;
    cout << "Blocks In Grid    = " << blk_in_grid << endl;
    cout << "---------------------------" << endl << endl;

    return 0;
}

---------------------------
__SUCCESS__
---------------------------
N                 = 1048576
Threads Per Block = 256
Blocks In Grid    = 4096
---------------------------




In [32]:
%%cuda

#include <iostream>
// Multiplies two N x N int matrices on the host: C = A * B.
//   A, B : input matrices, N*N elements each, indexed as M[row * N + col]
//   C    : output matrix, N*N elements; every element is overwritten
//   N    : matrix dimension; nothing is written when N <= 0
// Products and the running sum are computed in 64-bit arithmetic so that
// large inputs do not trigger signed int overflow (undefined behaviour).
// Because C holds int, each finished sum is narrowed to int when stored;
// sums outside the int range are therefore still not representable in C.
void matmul(int *A, int *B, int *C, int N)
{
    for (int Row = 0; Row < N; Row++)
    {
        // Offset of the first element of this row, computed in 64 bits so
        // that Row * N cannot overflow int for large N.
        const long long row_base = static_cast<long long>(Row) * N;
        for (int Col = 0; Col < N; Col++)
        {
            long long Pvalue = 0;
            for (int k = 0; k < N; k++)
            {
                // Widen one operand before multiplying so the product is
                // formed in long long rather than int.
                Pvalue += static_cast<long long>(A[row_base + k]) *
                          B[static_cast<long long>(k) * N + Col];
            }
            C[row_base + Col] = static_cast<int>(Pvalue);
        }
    }
}

// Builds two N x N int matrices on the host, multiplies them with matmul(),
// and prints the top-left corner (at most 10 x 10) of the product.
// NOTE: with N = 512 and these fill values many dot products exceed the
// range of int, and C stores int, so some printed values appear wrapped
// (negative).
int main()
{
    int N = 512;

    // new[] takes a number of ELEMENTS, not bytes, so each matrix needs
    // exactly N * N ints (multiplying by sizeof(int) would over-allocate 4x).
    int count = N * N;
    int *A = new int[count];
    int *B = new int[count];
    int *C = new int[count];

    // A holds its own flat index; B holds the transposed flat index.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            A[i * N + j] = i * N + j;
            B[i * N + j] = j * N + i;
        }
    }

    matmul(A, B, C, N);

    // Clamp the preview so a smaller N never reads outside the matrix.
    const int preview = (N < 10) ? N : 10;
    for (int i = 0; i < preview; i++)
    {
        for (int j = 0; j < preview; j++)
        {
            std::cout << C[i * N + j] << " ";
        }
        std::cout << std::endl;
    }

    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}


44608256 111586048 178563840 245541632 312519424 379497216 446475008 513452800 580430592 647408384 
111586048 312781568 513977088 715172608 916368128 1117563648 1318759168 1519954688 1721150208 1922345728 
178563840 513977088 849390336 1184803584 1520216832 1855630080 -2103923968 -1768510720 -1433097472 -1097684224 
245541632 715172608 1184803584 1654434560 2124065536 -1701270784 -1231639808 -762008832 -292377856 177253120 
312519424 916368128 1520216832 2124065536 -1567053056 -963204352 -359355648 244493056 848341760 1452190464 
379497216 1117563648 1855630080 -1701270784 -963204352 -225137920 512928512 1250994944 1989061376 -1567839488 
446475008 1318759168 -2103923968 -1231639808 -359355648 512928512 1385212672 -2037470464 -1165186304 -292902144 
513452800 1519954688 -1768510720 -762008832 244493056 1250994944 -2037470464 -1030968576 -24466688 982035200 
580430592 1721150208 -1433097472 -292377856 848341760 1989061376 -1165186304 -24466688 1116252928 -2037994752 
647408384 192234572

In [None]:

# Line-by-line walkthrough of the vector-addition cell above:

# cpp
# Copy code
# #include <iostream>
# This line includes the iostream header file, which allows input and output operations, such as printing to the console.

# cpp
# Copy code
# // Size of array
# #define N 1048576
# This line defines a constant N with a value of 1048576, representing the size of arrays used in the code.

# cpp
# Copy code
# using namespace std;
# This line declares that names from the standard C++ library (like cout and endl) will be used without explicitly specifying their namespace.

# cpp
# Copy code
# // Kernel
# __global__ void add_vectors(double *a, double *b, double *c)
# This line defines a CUDA kernel function named add_vectors. Kernels are functions that are executed on the GPU. This particular kernel is meant to add two arrays a and b element-wise and store the result in array c.

# cpp
# Copy code
# {
#     int id = blockDim.x * blockIdx.x + threadIdx.x;
#     if(id < N) c[id] = a[id] + b[id];
# }
# Within the kernel function, this block of code calculates the unique index (id) of each thread based on its block and thread indices. It then checks if this index is within the range of the array size N, and if so, it performs the addition of corresponding elements from arrays a and b, storing the result in array c.

# cpp
# Copy code
# // Main program
# int main()
# This line declares the main function of the program.

# cpp
# Copy code
# {
#     // Number of bytes to allocate for N doubles
#     size_t bytes = N*sizeof(double);
# This line calculates the total number of bytes needed to store N double precision floating-point numbers.

# cpp
# Copy code
#     // Allocate memory for arrays A, B, and C on host
#     double *A = new double[N];
#     double *B = new double[N];
#     double *C = new double[N];
# These lines dynamically allocate memory on the host (CPU) for three arrays A, B, and C, each containing N double precision floating-point numbers.

# cpp
# Copy code
#     // Allocate memory for arrays d_A, d_B, and d_C on device
#     double *d_A, *d_B, *d_C;
#     cudaMalloc(&d_A, bytes);
#     cudaMalloc(&d_B, bytes);
#     cudaMalloc(&d_C, bytes);
# These lines allocate memory on the device (GPU) for three arrays d_A, d_B, and d_C to store the input and output data for the CUDA kernel. cudaMalloc is a CUDA function used to allocate memory on the GPU.

# cpp
# Copy code
#     // Fill host arrays A and B
#     for(int i=0; i<N; i++)
#     {
#         A[i] = 1.0;
#         B[i] = 2.0;
#     }
# This loop fills the host arrays A and B with the values 1.0 and 2.0, respectively.

# cpp
# Copy code
#     // Copy data from host arrays A and B to device arrays d_A and d_B
#     cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
#     cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);
# These lines copy the data from the host arrays A and B to the corresponding device arrays d_A and d_B using cudaMemcpy. This function is used to transfer data between the CPU and GPU.

# The code continues similarly, executing a CUDA kernel (add_vectors), copying the result back to the host (C), verifying the result, freeing memory, and printing some information about the execution configuration.

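# The second part of the code implements a matrix multiplication (matmul) function in C++ that runs on the CPU. It multiplies two N x N matrices A and B, stored as flat arrays indexed by row * N + col, using three nested loops, and writes the product to the result matrix C. The main function fills A and B with values, calls matmul, and finally prints the top-left 10 x 10 corner of C to the console. Some of the printed values are negative because the dot products exceed the range of a 32-bit int and wrap around.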