In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-up2i43nx
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-up2i43nx
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 781ff5b76ba6c4c2d80dcbbec9983e147613cc71
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [3]:
%load_ext nvcc4jupyter

Source files will be saved in "/tmp/tmpmop9gnrk".


In [4]:
%%cuda
#include <stdio.h>

// Debug kernel: every launched thread prints the x-components of its block
// index and of its thread index within that block. The order of the printed
// lines across threads is not controlled by this kernel.
__global__ void hello()
{
    const unsigned int block  = blockIdx.x;
    const unsigned int thread = threadIdx.x;
    printf("Hello from block: %u, thread: %u\n", block, thread);
}

// Launches `hello` on 2 blocks of 2 threads and waits for the device to finish
// so the device-side printf output is produced before the process exits.
// Returns 0 on success and 1 if the launch or the execution reported an error.
int main(){
    hello<<<2, 2>>>();

    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError(), execution errors through the next sync.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Hello from block: 1, thread: 0
Hello from block: 1, thread: 1
Hello from block: 0, thread: 0
Hello from block: 0, thread: 1



In [33]:
%%cuda
#include <iostream>
#include <vector>
#include <numeric>
#include <stdio.h>
#include <typeinfo>

// Element-wise sum: c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks; one thread handles one element.
// The grid may contain more threads than elements -- threads whose global
// index is >= n do nothing, so no out-of-bounds access happens at the tail.
// a, b and c must be device pointers to at least n floats each.
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

// Adds two host vectors holding 1..size on the GPU and prints the result.
// Returns 0 on success and 1 if any CUDA call failed.
int main() {
    const int size = 50;
    const size_t bytes = size * sizeof(float);

    // Ceil-division so the grid always covers `size`, whatever its value.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    std::vector<float> a(size);
    std::vector<float> b(size);
    std::vector<float> c(size);

    std::iota(a.begin(), a.end(), 1.0f);
    std::iota(b.begin(), b.end(), 1.0f);

    // Initialised to nullptr so the cudaFree calls below are safe even when
    // an allocation failed (cudaFree on a null pointer is a no-op).
    float* a_dev = nullptr;
    float* b_dev = nullptr;
    float* c_dev = nullptr;

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto check = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            return false;
        }
        return true;
    };

    // && short-circuits: the first failure skips every later step.
    bool ok = check(cudaMalloc((void**)&a_dev, bytes), "cudaMalloc(a_dev)")
           && check(cudaMalloc((void**)&b_dev, bytes), "cudaMalloc(b_dev)")
           && check(cudaMalloc((void**)&c_dev, bytes), "cudaMalloc(c_dev)")
           && check(cudaMemcpy(a_dev, a.data(), bytes, cudaMemcpyHostToDevice), "copy a to device")
           && check(cudaMemcpy(b_dev, b.data(), bytes, cudaMemcpyHostToDevice), "copy b to device");

    if (ok) {
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_dev, b_dev, c_dev, size);
        // Launch errors surface via cudaGetLastError(); faults raised while
        // the kernel runs surface at the synchronisation point.
        ok = check(cudaGetLastError(), "vectorAdd launch")
          && check(cudaDeviceSynchronize(), "vectorAdd execution")
          && check(cudaMemcpy(c.data(), c_dev, bytes, cudaMemcpyDeviceToHost), "copy c to host");
    }

    if (ok) {
        for (const auto& value : c) {
            std::cout << value << " ";
        }
    }

    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    return ok ? 0 : 1;
}

2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 


In [None]:
%%cuda
#include <stdio.h>
#include <iostream>

// Computes a single output value of a 3x3 filter applied to a 3x3 row-major
// image at column x, row y (both expected in [0, 3)).
//
// The filter is centred on (x, y) and applied without flipping, i.e.
// kernel[h][w] multiplies input[y - 1 + h][x - 1 + w]. Taps that fall outside
// the image are skipped, which is equivalent to zero padding.
// Callable from both host and device so the arithmetic can be checked on the
// CPU as well.
__host__ __device__ inline int conv3x3At(const int *input, const int *kernel, int x, int y) {
    constexpr int kImageDim = 3;
    constexpr int kKernelDim = 3;
    constexpr int kPad = kKernelDim / 2;

    int acc = 0;
    for (int h = 0; h < kKernelDim; ++h) {
        const int i_h = y - kPad + h;
        // >= 0, not > 0: row 0 of the image is a valid tap.
        if (i_h < 0 || i_h >= kImageDim) {
            continue;
        }
        for (int w = 0; w < kKernelDim; ++w) {
            const int i_w = x - kPad + w;
            // Same for column 0.
            if (i_w < 0 || i_w >= kImageDim) {
                continue;
            }
            acc += input[i_h * kImageDim + i_w] * kernel[h * kKernelDim + w];
        }
    }
    return acc;
}

// 3x3 filter over a 3x3 image with zero padding ("same" output size).
//
// Launch layout: 2-D; the thread at global (x, y) produces output[y * 3 + x].
// The launch must cover at least 3x3 threads; surplus threads exit without
// touching memory.
// input, kernel and output are device pointers to 9 row-major ints each.
// output is overwritten (not accumulated into), so it does not need to be
// initialised by the caller.
__global__ void conv2d(const int *input, const int *kernel, int *output) {
    constexpr int kImageDim = 3;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard each coordinate separately: a flat-index check alone would let a
    // thread with x >= 3 alias a valid pixel of the next row.
    if (x >= kImageDim || y >= kImageDim) {
        return;
    }

    output[y * kImageDim + x] = conv3x3At(input, kernel, x, y);
}

// Runs the 3x3 conv2d kernel on a fixed 3x3 image and filter and prints the
// 3x3 result, one row per line.
// Returns 0 on success and 1 if any CUDA call failed.
int main() {
    // A single 3x3 block: one thread per output pixel.
    const dim3 blocksPerGrid(1, 1, 1);
    const dim3 threadsPerBlock(3, 3, 1);
    const size_t bytes = 9 * sizeof(int);

    int input[3][3] = {
        {0, 1, 2},
        {3, 4, 5},
        {6, 7, 8},
    };
    int kernel[3][3] = {
        {0, 0, 0},
        {1, 1, 1},
        {-1, -1, -1},
    };

    int output[3][3] = {0};

    // Initialised to nullptr so the cudaFree calls below are safe even when
    // an allocation failed (cudaFree on a null pointer is a no-op).
    int* input_dev = nullptr;
    int* kernel_dev = nullptr;
    int* output_dev = nullptr;

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto check = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            return false;
        }
        return true;
    };

    // && short-circuits: the first failure skips every later step.
    bool ok = check(cudaMalloc((void**)&input_dev, bytes), "cudaMalloc(input_dev)")
           && check(cudaMalloc((void**)&kernel_dev, bytes), "cudaMalloc(kernel_dev)")
           && check(cudaMalloc((void**)&output_dev, bytes), "cudaMalloc(output_dev)")
           // cudaMalloc does not initialise memory; zero the output buffer so
           // the kernel starts from a known state.
           && check(cudaMemset(output_dev, 0, bytes), "cudaMemset(output_dev)")
           && check(cudaMemcpy(input_dev, input, bytes, cudaMemcpyHostToDevice), "copy input to device")
           && check(cudaMemcpy(kernel_dev, kernel, bytes, cudaMemcpyHostToDevice), "copy kernel to device");

    if (ok) {
        conv2d<<<blocksPerGrid, threadsPerBlock>>>(input_dev, kernel_dev, output_dev);
        // Launch errors surface via cudaGetLastError(); faults raised while
        // the kernel runs surface at the synchronisation point.
        ok = check(cudaGetLastError(), "conv2d launch")
          && check(cudaDeviceSynchronize(), "conv2d execution")
          && check(cudaMemcpy(output, output_dev, bytes, cudaMemcpyDeviceToHost), "copy output to host");
    }

    if (ok) {
        for (int i = 0; i < 3; ++i) {
            for (int j = 0; j < 3; ++j) {
                std::cout << output[i][j] << " ";
            }
            std::cout << std::endl;
        }
    }

    cudaFree(input_dev);
    cudaFree(kernel_dev);
    cudaFree(output_dev);
    return ok ? 0 : 1;
}
    

