**Q1. Write a simple CUDA kernel that takes an array of integers and doubles each element.**

In [4]:
! touch add_basic.cu
"""
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

__global__ void add_basic(int n, int *x)
{
    for (int i = 0; i < n; i++)
    {
        x[i] *= 2;
    }
}

int main()
{
    int N = 10;
    int *x;

    cudaMallocManaged(&x, N * sizeof(int));

    for (int i = 0; i < N; i++)
    {
        x[i] = i + 1;
    }

    cout << "Before Doubling = First 2 elements of the array:" << endl;
    for (int i = 0; i < 2; ++i)
    {
        cout << x[i] << " ";
    }
    cout << endl;

    add_basic<<<1, 1>>>(N, x);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    cout << "After Doubling = First 2 elements of the array:" << endl;
    for (int i = 0; i < 2; ++i)
    {
        cout << x[i] << " ";
    }
    cout << endl;

    cudaFree(x);

    return 0;
}
"""

'\n#include <iostream>\n#include <cuda_runtime.h>\n\nusing namespace std;\n\n__global__ void add_basic(int n, int *x)\n{\n    for (int i = 0; i < n; i++)\n    {\n        x[i] *= 2;\n    }\n}\n\nint main()\n{\n    int N = 10;\n    int *x;\n\n    cudaMallocManaged(&x, N * sizeof(int));\n\n    for (int i = 0; i < N; i++)\n    {\n        x[i] = i + 1;\n    }\n\n    cout << "Before Doubling = First 2 elements of the array:" << endl;\n    for (int i = 0; i < 2; ++i)\n    {\n        cout << x[i] << " ";\n    }\n    cout << endl;\n\n    add_basic<<<1, 1>>>(N, x);\n\n    // Wait for GPU to finish before accessing on host\n    cudaDeviceSynchronize();\n\n    cout << "After Doubling = First 2 elements of the array:" << endl;\n    for (int i = 0; i < 2; ++i)\n    {\n        cout << x[i] << " ";\n    }\n    cout << endl;\n\n    cudaFree(x);\n\n    return 0;\n}\n'

**Q2. Write a CUDA kernel to initialize an array of integers with the index value.**

In [5]:
! touch add_basic.cu
"""
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

__global__ void initialize_array(int *array, int size)
{
    // Calculate the index for the current thread
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        array[index] = index;
    }
    // COMPLETE THIS
}

int main()
{
    const int array_size = 10;
    int *d_array;

    // Allocate memory on GPU
    cudaMalloc((void **)&d_array, array_size * sizeof(int));

    // Launch the CUDA kernel to initialize the array
    initialize_array<<<1, array_size>>>(d_array, array_size);

    // Copy data from device to host
    int h_array[array_size];
    cudaMemcpy(h_array, d_array, array_size * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the initialized array
    cout << "Initialized Array:" << endl;
    for (int i = 0; i < array_size; ++i)
    {
        cout << h_array[i] << " ";
    }
    cout << endl;

    // Free GPU memory
    cudaFree(d_array);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    return 0;
}
"""

'\n#include <iostream>\n#include <cuda_runtime.h>\n\nusing namespace std;\n\n__global__ void initialize_array(int *array, int size)\n{\n    // Calculate the index for the current thread\n    int index = blockIdx.x * blockDim.x + threadIdx.x;\n    if (index < size)\n    {\n        array[index] = index;\n    }\n    // COMPLETE THIS\n}\n\nint main()\n{\n    const int array_size = 10;\n    int *d_array;\n\n    // Allocate memory on GPU\n    cudaMalloc((void **)&d_array, array_size * sizeof(int));\n\n    // Launch the CUDA kernel to initialize the array\n    initialize_array<<<1, array_size>>>(d_array, array_size);\n\n    // Copy data from device to host\n    int h_array[array_size];\n    cudaMemcpy(h_array, d_array, array_size * sizeof(int), cudaMemcpyDeviceToHost);\n\n    // Print the initialized array\n    cout << "Initialized Array:" << endl;\n    for (int i = 0; i < array_size; ++i)\n    {\n        cout << h_array[i] << " ";\n    }\n    cout << endl;\n\n    // Free GPU memory\n    cuda

**Q3 [OPTIONAL]. How do you check for and handle errors in CUDA API calls and kernel launches?**

In [6]:
! touch add_basic_error_handling.cu
"""
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

#define CHECK_CUDA_ERROR(err)                                          \
    {                                                                  \
        if (err != cudaSuccess)                                        \
        {                                                              \
            cerr << "CUDA Error: " << cudaGetErrorString(err) << endl; \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    }

__global__ void initialize_array(int *array, int size)
{
    // Calculate the index for the current thread
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        array[index] = index;
    }
    // COMPLETE THIS
}

int main()
{
    const int array_size = 10;
    int *d_array;

    // Allocate memory on GPU
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_array, array_size * sizeof(int)));

    // Launch the CUDA kernel to initialize the array
    initialize_array<<<1, array_size>>>(d_array, array_size);
    CHECK_CUDA_ERROR(cudaGetLastError());

    // Copy data from device to host
    int h_array[array_size];
    CHECK_CUDA_ERROR(cudaMemcpy(h_array, d_array, array_size * sizeof(int), cudaMemcpyDeviceToHost));

    // Print the initialized array
    cout << "Initialized Array:" << endl;
    for (int i = 0; i < array_size; ++i)
    {
        cout << h_array[i] << " ";
    }
    cout << endl;

    // Free GPU memory
    CHECK_CUDA_ERROR(cudaFree(d_array));

    // Wait for GPU to finish before accessing on host
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());

    return 0;
}
"""

'\n#include <iostream>\n#include <cuda_runtime.h>\n\nusing namespace std;\n\n#define CHECK_CUDA_ERROR(err)                                                        {                                                                                    if (err != cudaSuccess)                                                          {                                                                                    cerr << "CUDA Error: " << cudaGetErrorString(err) << endl;             exit(EXIT_FAILURE);                                                          }                                                                            }\n\n__global__ void initialize_array(int *array, int size)\n{\n    // Calculate the index for the current thread\n    int index = blockIdx.x * blockDim.x + threadIdx.x;\n    if (index < size)\n    {\n        array[index] = index;\n    }\n    // COMPLETE THIS\n}\n\nint main()\n{\n    const int array_size = 10;\n    int *d_array;\n\n    // Allocate memory on GP