In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-0vtidhht
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-0vtidhht
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=9db6faf0cb82abfa3df7a122628df5a10687c975331fd3c4054406a9fa75d450
  Stored in directory: /tmp/pip-ephem-wheel-cache-hcx4kkwv/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collecte

In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [6]:
%%cu
#include <stdio.h>



// CUDA kernel for vector addition
__global__ void vectorAddition(float *a, float *b, float *c, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int size = 1000000;  // Size of the vectors
    int memSize = size * sizeof(float);

    // Allocate memory for host vectors
    float *h_a = (float*)malloc(memSize);
    float *h_b = (float*)malloc(memSize);
    float *h_c = (float*)malloc(memSize);

    // Initialize host vectors
    for (int i = 0; i < size; i++) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // Allocate memory for device vectors
    float *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, memSize);
    cudaMalloc((void**)&d_b, memSize);
    cudaMalloc((void**)&d_c, memSize);

    // Copy host vectors to device
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, memSize, cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    int blockSize = 256;
    int gridSize = (size + blockSize - 1) / blockSize;

    // Launch the kernel
    vectorAddition<<<gridSize, blockSize>>>(d_a, d_b, d_c, size);

    // Copy result back to host
    cudaMemcpy(h_c, d_c, memSize, cudaMemcpyDeviceToHost);

    // Print the result
    for (int i = 0; i < 10; i++) {
        printf("%f + %f = %f\n", h_a[i], h_b[i], h_c[i]);
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

0.000000 + 0.000000 = 0.000000
1.000000 + 2.000000 = 3.000000
2.000000 + 4.000000 = 6.000000
3.000000 + 6.000000 = 9.000000
4.000000 + 8.000000 = 12.000000
5.000000 + 10.000000 = 15.000000
6.000000 + 12.000000 = 18.000000
7.000000 + 14.000000 = 21.000000
8.000000 + 16.000000 = 24.000000
9.000000 + 18.000000 = 27.000000



In [7]:
%%cu
#include <stdio.h>

#define N 1024   // Size of the matrices

// CUDA kernel for matrix multiplication
__global__ void matrixMultiplication(int *a, int *b, int *c) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int sum = 0;
    for (int i = 0; i < N; i++) {
        sum += a[row * N + i] * b[i * N + col];
    }

    c[row * N + col] = sum;
}

int main() {
    int memSize = N * N * sizeof(int);

    // Allocate memory for host matrices
    int *h_a = (int*)malloc(memSize);
    int *h_b = (int*)malloc(memSize);
    int *h_c = (int*)malloc(memSize);

    // Initialize host matrices
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            h_a[i * N + j] = i + j;
            h_b[i * N + j] = i - j;
        }
    }

    // Allocate memory for device matrices
    int *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, memSize);
    cudaMalloc((void**)&d_b, memSize);
    cudaMalloc((void**)&d_c, memSize);

    // Copy host matrices to device
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, memSize, cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 blockSize(16, 16);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x, (N + blockSize.y - 1) / blockSize.y);

    // Launch the kernel
    matrixMultiplication<<<gridSize, blockSize>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(h_c, d_c, memSize, cudaMemcpyDeviceToHost);

    // Print the result
    for (int i = 0; i < 10; i++) {
        for (int j = 0; j < 10; j++) {
            printf("%d\t", h_c[i * N + j]);
        }
        printf("\n");
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

357389824	356866048	356342272	355818496	355294720	354770944	354247168	353723392	353199616	352675840	
357913600	357388800	356864000	356339200	355814400	355289600	354764800	354240000	353715200	353190400	
358437376	357911552	357385728	356859904	356334080	355808256	355282432	354756608	354230784	353704960	
358961152	358434304	357907456	357380608	356853760	356326912	355800064	355273216	354746368	354219520	
359484928	358957056	358429184	357901312	357373440	356845568	356317696	355789824	355261952	354734080	
360008704	359479808	358950912	358422016	357893120	357364224	356835328	356306432	355777536	355248640	
360532480	360002560	359472640	358942720	358412800	357882880	357352960	356823040	356293120	355763200	
361056256	360525312	359994368	359463424	358932480	358401536	357870592	357339648	356808704	356277760	
361580032	361048064	360516096	359984128	359452160	358920192	358388224	357856256	357324288	356792320	
362103808	361570816	361037824	360504832	359971840	359438848	358905856	358372864	357839872	3