In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [6]:
%%cuda --name testGoogleColab.cu
#include <cstdlib>
#include <ctime>
#include <iostream>

// Number of int elements in the host and device arrays processed by main().
#define N (2048 * 2048)
// Threads per block used for the kernel launch; also sizes the kernel's shared-memory tile.
#define THREADS_PER_BLOCK 512
#define RADIUS 2 // Stencil radius: how many neighbors on each side are summed with an element

// Fills array[0 .. size) with pseudo-random integers in the range 0..99.
// The C library generator is reseeded from the current time on every call.
__host__ void random_ints(int* array, int size) {
    srand(time(NULL));

    int* cursor = array;
    int remaining = size;
    while (remaining > 0) {
        *cursor++ = rand() % 100; // keep values in 0..99
        --remaining;
    }
}

// 1D stencil: out[i] = sum of in[i - RADIUS .. i + RADIUS] for every i in
// [0, n). Input positions outside [0, n) contribute 0 to the sum.
//
// Launch layout: 1D grid of 1D blocks, one thread per output element.
//   - blockDim.x must equal THREADS_PER_BLOCK (the shared tile and the
//     right-halo offset are sized from that constant).
//   - blockDim.x must be >= RADIUS, because the first RADIUS threads of each
//     block load the halo cells.
//   - gridDim.x * blockDim.x must be >= n; surplus threads do no global
//     loads or stores.
// Shared memory: static tile of THREADS_PER_BLOCK + 2 * RADIUS ints.
//
// in  - device pointer to n input ints
// out - device pointer to n output ints
// n   - number of elements
__global__ void stencil_1d(int *in, int *out, int n)
{
    __shared__ int temp[THREADS_PER_BLOCK + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Centre element of the tile; pad with 0 past the end of the input.
    temp[lindex] = (gindex < n) ? in[gindex] : 0;

    // The first RADIUS threads additionally fill the left and right halos.
    // The bounds are checked on the GLOBAL indices: the first block has no
    // left neighbors and the last block has no right neighbors, so those
    // halo cells are zero-filled instead of reading outside the array.
    if (threadIdx.x < RADIUS)
    {
        int left = gindex - RADIUS;
        temp[lindex - RADIUS] = (left >= 0 && left < n) ? in[left] : 0;

        int right = gindex + THREADS_PER_BLOCK;
        temp[lindex + THREADS_PER_BLOCK] = (right < n) ? in[right] : 0;
    }

    // Every thread of the block reaches this barrier (it is outside the
    // divergent branch), so the whole tile is visible before it is read.
    __syncthreads();

    // Threads beyond the end of the data only helped fill the tile.
    if (gindex >= n)
        return;

    // lindex + offset always lies inside temp[0 .. THREADS_PER_BLOCK + 2*RADIUS).
    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
        result += temp[lindex + offset];

    // Store the result
    out[gindex] = result;
}


// Exit with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Print data[begin .. end) on one line, each value followed by a space.
static void printRange(const int *data, int begin, int end)
{
    for (int i = begin; i < end; i++) {
        std::cout << data[i] << " ";
    }
    std::cout << std::endl;
}

// Fills a host array with random ints, runs stencil_1d over it on the GPU,
// and prints three 30-element windows (start, end, and a random position)
// of both the input and the stenciled output.
// Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
int main(void)
{
    int *a, *b;     // host input (a) and host output (b)
    int *d_a, *d_b; // device input (d_a) and device output (d_b)
    const size_t size = N * sizeof(int);

    // Allocate the host buffers and fill the input with random values
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    if (a == NULL || b == NULL) {
        std::cerr << "host allocation of " << size << " bytes failed" << std::endl;
        free(a);
        free(b);
        return EXIT_FAILURE;
    }
    random_ints(a, N);

    checkCuda(cudaMalloc((void **)&d_a, size), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void **)&d_b, size), "cudaMalloc d_b");
    checkCuda(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "copy input to device");

    // Launch stencil_1d() on the GPU; round the block count up so every
    // element has a thread even if N is not a multiple of the block size.
    const int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    stencil_1d<<<blocks, THREADS_PER_BLOCK>>>(d_a, d_b, N);
    // A kernel launch returns no status itself; launch-configuration errors
    // have to be fetched explicitly.
    checkCuda(cudaGetLastError(), "stencil_1d launch");

    // Copy result back to host (blocking copy; also reports kernel faults)
    checkCuda(cudaMemcpy(b, d_b, size, cudaMemcpyDeviceToHost), "copy result to host");

    // Print part of the result
    std::cout<<"first 30 origin data is:"<<std::endl;
    printRange(a, 0, 30);

    std::cout<<"when radius is "<<RADIUS<<" first 30 stenciled data is:"<<std::endl;
    printRange(b, 0, 30);

    std::cout<<"last 30 origin data is:"<<std::endl;
    printRange(a, N-30, N);

    std::cout<<"when radius is "<<RADIUS<<" last 30 stenciled data is:"<<std::endl;
    printRange(b, N-30, N);

    srand(time(NULL));
    int middle =(rand()%100)*(rand()%100);
    // The window printed is [middle-30, middle); the product above can be
    // anywhere in 0..9801, so clamp it to keep the window inside the arrays.
    if (middle < 30) {
        middle = 30;
    }
    std::cout<<"from "<<middle<<" middle 30 origin data is:"<<std::endl;
    printRange(a, middle-30, middle);

    std::cout<<"when radius is "<<RADIUS<<" middle 30 stenciled data is:"<<std::endl;
    printRange(b, middle-30, middle);

    // Cleanup
    free(a);
    free(b);
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");

    return 0;
}

'File written in /content/src/testGoogleColab.cu'

# New Section

In [7]:
!nvcc -arch=sm_75 -o "/content/src/testGoogleColab.o" /content/src/testGoogleColab.cu

In [8]:
!chmod 755 /content/src/testGoogleColab.o
!/content/src/testGoogleColab.o

first 30 origin data is:
19 12 14 58 16 4 72 82 11 77 58 27 84 10 81 57 69 69 69 49 17 28 53 61 49 25 75 20 78 8 
when radius is 2first 30 stenciled data is:
45 103 119 104 164 232 185 246 300 255 257 256 260 259 301 286 345 313 273 232 216 208 208 216 263 230 247 206 256 230 
last 30 origin data is:
37 70 22 52 95 72 75 33 98 49 30 57 50 50 82 83 36 61 89 37 88 60 81 97 23 4 89 46 86 7 
when radius is 2 last 30 stenciled data is:
225 259 276 311 316 327 373 327 285 267 284 236 269 322 301 312 351 306 311 335 355 363 349 265 294 259 248 232 228 139 
from 7920 middle 30 origin data is:
12 34 70 76 13 77 67 81 52 73 19 98 40 68 39 97 86 59 25 4 21 46 54 90 38 81 10 3 46 68 
when radius is 2 middle 30 stenciled data is:
178 248 205 270 303 314 290 350 292 323 282 298 264 342 330 349 306 271 195 155 150 215 249 309 273 222 178 208 187 188 
