In [5]:
%%writefile vector_add.cu

#include <iostream>
#include <stdio.h> // Include stdio.h for printf
#include <math.h> // Include math.h for ceil

__global__ void vectorAdd(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
        printf("Thread %d: A[%d] = %f, B[%d] = %f, C[%d] = %f\n", i, i, A[i], i, B[i], i, C[i]);
    }
}

int main() {
    const int N = 10;
    float A[N], B[N], C[N];

    // Initialize host arrays
    for (int i = 0; i < N; ++i) {
        A[i] = i;
        B[i] = i * 2;
    }

    float *d_a, *d_b,*d_c;
    cudaMalloc(&d_a,N*sizeof(float));
    cudaMalloc(&d_b,N*sizeof(float));
    cudaMalloc(&d_c,N*sizeof(float));
    cudaMemcpy(d_a,A,N*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,B,N*sizeof(float),cudaMemcpyHostToDevice);

    int blocksize=256;
    int gridsize=ceil((float)N/blocksize); // Cast N to float for ceil

    printf("Launching kernel with grid size %d and block size %d\n", gridsize, blocksize);
    vectorAdd<<<gridsize,blocksize>>>(d_a,d_b,d_c,N);
    cudaDeviceSynchronize(); // Synchronize to ensure all kernel threads complete
    cudaMemcpy(C,d_c,N*sizeof(float),cudaMemcpyDeviceToHost);

    printf("\nResult on host:\n");
    for (int i = 0; i < N; ++i) {
        printf("C[%d] = %f\n", i, C[i]);
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

Overwriting vector_add.cu


In [6]:
# Compile with the specified architecture
!nvcc vector_add.cu -o vector_add -gencode arch=compute_75,code=sm_75

# Run the executable
!./vector_add

Launching kernel with grid size 1 and block size 256

Result on host:
C[0] = 0.000000
C[1] = 0.000000
C[2] = 0.000000
C[3] = 0.000000
C[4] = 0.000000
C[5] = 0.000000
C[6] = 0.000000
C[7] = 0.000000
C[8] = 0.000000
C[9] = 0.000000
