In [28]:
%%writefile vector_add.cu
#include <stdio.h>
#include <cstdlib>
using namespace std;

// Element-wise vector sum: C[k] = A[k] + B[k] for 0 <= k < numElements.
//
// Each thread derives one flat index from its block and thread x-coordinates
// and writes at most one output element. Threads whose index lands at or
// beyond numElements return without touching memory.
__global__
void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= numElements) {
        return;  // nothing to do for indices past the end of the vectors
    }
    C[idx] = A[idx] + B[idx];
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// did not return cudaSuccess. `what` names the operation that was attempted.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two randomly initialised float vectors on the GPU, reports the time
// measured between two CUDA events recorded around the kernel launch, and
// prints the first few sums.
//
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails.
int main() {
    const int numElements = 50000;  // Number of elements in each vector
    const size_t size = numElements * sizeof(float);
    float *h_A, *h_B, *h_C;  // host copies of A, B, C
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;  // device copies of A, B, C

    // Allocate host memory
    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors\n");
        free(h_A);  // free(NULL) is a no-op, so this is safe for any mix
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize the host input vectors with values in [0, 1]
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // Allocate device memory
    checkCuda(cudaMalloc(&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_C, size), "cudaMalloc d_C");

    // Copy the host input vectors to device
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Create CUDA events for timing      (optional)
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

    // Start Record          (optional)
    checkCuda(cudaEventRecord(start), "cudaEventRecord start");

    // Launch the Vector Add CUDA Kernel; the block count is rounded up so
    // that every element is covered by some thread.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // A kernel launch returns no status itself; launch errors must be
    // fetched explicitly.
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // Stop timing after kernel completion   (optional)
    checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");

    // Calculate elapsed time    (optional)
    float milliseconds = 0;
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    printf("Elapsed time: %f ms\n", milliseconds);

    // Copy the device result vector back to the host
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // Print some of the results (never more than the vector holds)
    const int sampleCount = numElements < 10 ? numElements : 10;
    printf("Sample results:\n");
    for (int i = 0; i < sampleCount; ++i) {
        printf("C[%d] = %f\n", i, h_C[i]);
    }

    // Free device memory
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // Destroy CUDA events   (optional)
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy start");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy stop");

    return 0;
}


Overwriting vector_add.cu


In [29]:
!nvcc vector_add.cu -o vector_add


In [30]:
!./vector_add


Elapsed time: 0.207648 ms
Sample results:
C[0] = 1.234571
C[1] = 1.581539
C[2] = 1.109199
C[3] = 1.103452
C[4] = 0.831745
C[5] = 1.106268
C[6] = 0.878185
C[7] = 1.868425
C[8] = 1.353009
C[9] = 0.748571


In [13]:
%%writefile add.cu
#include <iostream>
#include <cstdlib> // Include <cstdlib> for rand()
using namespace std;

// Element-wise integer sum: C[k] = A[k] + B[k] for 0 <= k < size.
//
// Every thread computes one flat index from its block and thread
// x-coordinates; threads whose index is not below `size` exit immediately.
__global__
void add(int* A, int* B, int* C, int size) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= size) {
        return;  // index lies past the last element
    }
    C[idx] = A[idx] + B[idx];
}

// Writes the first `size` entries of `vector` to standard output, each one
// followed by a single space, then ends the line and flushes the stream.
void print(int* vector, int size) {
    int pos = 0;
    while (pos < size) {
        std::cout << vector[pos];
        std::cout << " ";
        ++pos;
    }
    std::cout << std::endl;
}

// Prints `what` followed by the CUDA error text to stderr when `status` is
// not cudaSuccess. Returns true on success so calls can be chained with &&.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << what << ": " << cudaGetErrorString(status) << endl;
    return false;
}

// Reads `count` integers from standard input into `dst`. Returns false as
// soon as the stream fails (end of input or a non-numeric token).
static bool readVector(int* dst, int count) {
    for (int i = 0; i < count; i++) {
        if (!(cin >> dst[i])) {
            return false;
        }
    }
    return true;
}

// Computes C = A + B for N-element host arrays by copying A and B to the
// device, launching the `add` kernel and copying the result back into C.
// Device buffers are released on every path. Returns true only if every
// CUDA call succeeded. Expects N > 0.
static bool addOnDevice(const int* A, const int* B, int* C, int N) {
    const size_t vectorBytes = static_cast<size_t>(N) * sizeof(int);

    // Start from nullptr so the unconditional cudaFree calls below are safe
    // even when an allocation never happened.
    int* X = nullptr;
    int* Y = nullptr;
    int* Z = nullptr;

    // Decide success from the return codes: the pointer values are not a
    // reliable failure indicator.
    bool ok = cudaOk(cudaMalloc(&X, vectorBytes), "CUDA memory allocation failed")
           && cudaOk(cudaMalloc(&Y, vectorBytes), "CUDA memory allocation failed")
           && cudaOk(cudaMalloc(&Z, vectorBytes), "CUDA memory allocation failed");

    // Copy data from host to device
    ok = ok
      && cudaOk(cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice), "CUDA memcpy failed")
      && cudaOk(cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice), "CUDA memcpy failed");

    if (ok) {
        // Round the block count up so every element gets a thread.
        const int threadsPerBlock = 256;
        const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch kernel
        add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);

        // A launch returns no status itself; fetch launch errors explicitly.
        ok = cudaOk(cudaGetLastError(), "CUDA kernel launch failed");
    }

    // Copy result from device to host. This blocking copy also reports
    // errors raised while the kernel was executing.
    ok = ok
      && cudaOk(cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost), "CUDA memcpy failed");

    // Free device memory
    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return ok;
}

// Reads a vector length and two integer vectors from standard input, adds
// them on the GPU and prints both inputs and the element-wise sum.
//
// Returns 0 on success and 1 if the input is invalid or any CUDA call fails.
int main() {
    int N = 0;
    cout << "Enter the size of the vectors: ";
    // Reject unreadable, zero or negative sizes before they reach new[] or
    // the kernel launch configuration.
    if (!(cin >> N) || N <= 0) {
        cerr << "Vector size must be a positive integer" << endl;
        return 1;
    }

    // Allocate host memory
    int* A = new int[N];
    int* B = new int[N];
    int* C = new int[N];

    // Initialize host arrays
    cout << "Enter elements of vector A:" << endl;
    bool ok = readVector(A, N);
    if (ok) {
        cout << "Enter elements of vector B:" << endl;
        ok = readVector(B, N);
    }

    if (!ok) {
        cerr << "Failed to read vector elements" << endl;
    } else {
        cout << "Vector A: ";
        print(A, N);
        cout << "Vector B: ";
        print(B, N);

        ok = addOnDevice(A, B, C, N);
        if (ok) {
            cout << "Addition: ";
            print(C, N);
        }
    }

    // Free host memory on every path, including failures.
    delete[] A;
    delete[] B;
    delete[] C;

    return ok ? 0 : 1;
}


Writing add.cu


In [14]:
!nvcc add.cu -o add

In [15]:
!./add

Enter the size of the vectors: 25
Enter elements of vector A:
2 25 6 5 8 5 6 3 4 8 6 4 8 6 5 2 3 5 7 5 6 3 3 2 4 5 6 3 2 5 6 
Enter elements of vector B:
5 6 8 9 4 2 3 7 8 6 5 1 4 2 5 8 9 6 3 14 25 36 30 52 63
Vector A: 2 25 6 5 8 5 6 3 4 8 6 4 8 6 5 2 3 5 7 5 6 3 3 2 4 
Vector B: 5 6 3 2 5 6 5 6 8 9 4 2 3 7 8 6 5 1 4 2 5 8 9 6 3 
Addition: 7 31 9 7 13 11 11 9 12 17 10 6 11 13 13 8 8 6 11 7 11 11 12 8 7 
