In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [3]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-sxpz4noh
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-sxpz4noh
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=12d9fb1fc4927703229552f917dc0366b42b79c08e7239bd7c1f5175e6b32ed0
  Stored in directory: /tmp/pip-ephem-wheel-cache-n2sohj1h/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin

Vector Addition

In [8]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [21]:
%%cu
#include <stdio.h>

#define N 5 // Number of elements in each vector

// Element-wise vector sum kernel.
// Each thread computes one global index from its block and thread position;
// threads whose index is below N store a[idx] + b[idx] into c[idx], and all
// remaining threads exit without touching memory.
__global__
void vectorAdd(int *a, int *b, int *c)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have nothing to do.
    if (idx >= N)
    {
        return;
    }

    c[idx] = b[idx] + a[idx];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess and false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints `title`, then the N values of `v` each followed by a space, then `tail`.
static void printVector(const char *title, const int *v, const char *tail)
{
    printf("%s", title);
    for (int i = 0; i < N; i++)
    {
        printf("%d ", v[i]);
    }
    printf("%s", tail);
}

// Adds two N-element vectors on the GPU and prints the inputs and the result.
// Returns 0 on success and 1 if any host allocation or CUDA call fails.
int main()
{
    const size_t bytes = N * sizeof(int);

    // Host vectors
    int *a = (int*)malloc(bytes);
    int *b = (int*)malloc(bytes);
    int *c = (int*)malloc(bytes);

    // Device vectors; start as NULL so the cleanup below is valid even when
    // an allocation was never attempted or failed.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;

    bool ok = (a != NULL) && (b != NULL) && (c != NULL);
    if (!ok)
    {
        fprintf(stderr, "Host memory allocation failed\n");
    }

    // Initialize the host vectors
    if (ok)
    {
        for (int i = 0; i < N; i++)
        {
            a[i] = i;
            b[i] = 2 * i;
        }
    }

    // Allocate memory for the device vectors
    ok = ok
        && checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)")
        && checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)")
        && checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    // Copy the host vectors to device
    ok = ok
        && checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)")
        && checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

    // Perform the vector addition on the GPU
    if (ok)
    {
        int blockSize = 256;
        int gridSize = (N + blockSize - 1) / blockSize; // ceiling division
        vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c);

        // A kernel launch returns no status itself; query it explicitly.
        ok = checkCuda(cudaGetLastError(), "vectorAdd launch");
    }

    // Copy the result back to host (errors raised while the kernel was
    // running are also reported by this blocking copy).
    ok = ok && checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");

    // Display the vectors and result
    if (ok)
    {
        printVector("Vector A:\n", a, "\n\n");
        printVector("Vector B:\n", b, "\n\n");
        printVector("Resultant Vector C:\n", c, "\n");
    }

    // Cleanup (free(NULL) and cudaFree(NULL) are both no-ops)
    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return ok ? 0 : 1;
}



Vector A:
0 1 2 3 4 

Vector B:
0 2 4 6 8 

Resultant Vector C:
0 3 6 9 12 



Matrix Multiplication

In [17]:
%%cu
#include <iostream>
#include <cuda.h>

// Matrix multiplication kernel.
// A, B and C are indexed as flat N x N arrays (element (r, c) at r * N + c).
// The thread's y coordinates select the row and its x coordinates the column
// of the single C element it computes; threads outside the N x N range exit
// without doing any work.
__global__ void matrixMultiplication(float* A, float* B, float* C, int N) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row >= N || col >= N) {
        return;
    }

    // Offset of the first element of this thread's row in A and in C.
    const int rowBase = row * N;

    // Dot product of row `row` of A with column `col` of B.
    float acc = 0.0f;
    for (int k = 0; k < N; ++k) {
        acc += A[rowBase + k] * B[k * N + col];
    }

    C[rowBase + col] = acc;
}

// Reports a failed CUDA runtime call on std::cerr.
// Returns true when `status` is cudaSuccess and false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Prints `title` on its own line, then the n x n matrix `m` one row per line,
// each element followed by a space.
static void printMatrix(const char* title, const float* m, int n) {
    std::cout << title << std::endl;
    for (int i = 0; i < n * n; ++i) {
        std::cout << m[i] << " ";
        if ((i + 1) % n == 0) {
            std::cout << std::endl;
        }
    }
}

// Multiplies two N x N matrices on the GPU and prints the inputs and the
// product. Returns 0 on success and 1 if any host allocation or CUDA call
// fails.
int main() {
    int N = 2; // Matrix size
    const size_t size = static_cast<size_t>(N) * N * sizeof(float);
    dim3 blockSize(16, 16); // Threads per block
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x, (N + blockSize.y - 1) / blockSize.y); // Blocks per grid

    // Allocate memory on the host
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == nullptr || h_B == nullptr || h_C == nullptr) {
        std::cerr << "Host memory allocation failed" << std::endl;
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    // Initialize input matrices
    for (int i = 0; i < N * N; ++i) {
        h_A[i] = i;
        h_B[i] = i;
    }

    // Print input matrices, each followed by a blank line
    printMatrix("Matrix A:", h_A, N);
    std::cout << std::endl;

    printMatrix("Matrix B:", h_B, N);
    std::cout << std::endl;

    // Allocate memory on the device; pointers start null so the cleanup
    // below is valid even if an allocation fails part-way through.
    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    bool ok = checkCuda(cudaMalloc(&d_A, size), "cudaMalloc(d_A)")
           && checkCuda(cudaMalloc(&d_B, size), "cudaMalloc(d_B)")
           && checkCuda(cudaMalloc(&d_C, size), "cudaMalloc(d_C)");

    // Copy input matrices from host to device
    ok = ok
      && checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(h_A -> d_A)")
      && checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(h_B -> d_B)");

    // Launch the matrix multiplication kernel
    if (ok) {
        matrixMultiplication<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);

        // A kernel launch returns no status itself; query it explicitly.
        ok = checkCuda(cudaGetLastError(), "matrixMultiplication launch");
    }

    // Copy the result matrix from device to host (errors raised while the
    // kernel was running are also reported by this blocking copy).
    ok = ok && checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_C -> h_C)");

    // Print the result matrix
    if (ok) {
        printMatrix("Result Matrix C:", h_C, N);
    }

    // Free device memory (cudaFree on a null pointer is a no-op)
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free host memory (the original released only h_A)
    free(h_A);
    free(h_B);
    free(h_C);

    return ok ? 0 : 1;
}


Matrix A:
0 1 
2 3 

Matrix B:
0 1 
2 3 

Result Matrix C:
2 3 
6 11 

