```
CEPARCO-S11
GROUP 2
TOPIC: GPU-CPU Memory Transfer
MEMBERS:
    ALONZO, Jose Anton S.
    AVELINO, Joris Gabriel L.
    CRUZ, Airon John R.
    HERNANDEZ, Pierre Vincent C.

```

# IMPLEMENTATION #1: Old Method of Transferring Data between CPU and GPU Memory (cudaMemcpy)

## Vector Size: `2^20`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_1_256_20.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 20; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 256; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_1_256_20.cu -o CUDA_1_256_20
nvprof ./CUDA_1_256_20

### Number of Threads: `512`

In [None]:
%%writefile CUDA_1_512_20.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 20; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 512; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_1_512_20.cu -o CUDA_1_512_20
nvprof ./CUDA_1_512_20

### Number of Threads: `1024`

In [None]:
%%writefile CUDA_1_1K_20.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 20; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 1024; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_1_1K_20.cu -o CUDA_1_1K_20
nvprof ./CUDA_1_1K_20

## Vector Size: `2^22`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_1_256_22.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 22; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 256; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_1_256_22.cu -o CUDA_1_256_22
nvprof ./CUDA_1_256_22

### Number of Threads: `512`

In [None]:
%%writefile CUDA_1_512_22.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 22; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 512; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

Writing CUDA_1_512_22.cu


In [None]:
%%shell
# Build the 2^22 / 512-thread program under its own name so that the binary
# profiled on the next line is the one that was just compiled.
nvcc CUDA_1_512_22.cu -o CUDA_1_512_22
nvprof ./CUDA_1_512_22

### Number of Threads: `1024`

In [None]:
%%writefile CUDA_1_1K_22.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 22; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 1024; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_1_1K_22.cu -o CUDA_1_1K_22
nvprof ./CUDA_1_1K_22

## Vector Size: `2^24`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_1_256_24.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 24; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 256; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_1_256_24.cu -o CUDA_1_256_24
nvprof ./CUDA_1_256_24

### Number of Threads: `512`

In [None]:
%%writefile CUDA_1_512_24.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 24; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 512; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

Writing CUDA_1_512_24.cu


In [None]:
%%shell
nvcc CUDA_1_512_24.cu -o CUDA_1_512_24
nvprof ./CUDA_1_512_24


-- 1-D Convolution --
Vector size: 16777216
Number of threads: 512
Number of blocks: 32768

==860== NVPROF is profiling process 860, command: ./CUDA_1_512_24
Error count: 0
==860== Profiling application: ./CUDA_1_512_24
==860== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   49.24%  68.031ms         1  68.031ms  68.031ms  68.031ms  [CUDA memcpy DtoH]
                   29.03%  40.110ms         2  20.055ms  15.570ms  24.540ms  [CUDA memcpy HtoD]
                   21.73%  30.028ms        30  1.0009ms  997.91us  1.0035ms  convolution(float*, float*, int)
      API calls:   72.75%  386.52ms         2  193.26ms  223.73us  386.29ms  cudaMalloc
                   26.49%  140.73ms         3  46.910ms  15.791ms  99.976ms  cudaMemcpy
                    0.49%  2.6154ms         2  1.3077ms  336.63us  2.2788ms  cudaFree
                    0.18%  968.94us         1  968.94us  968.94us  968.94us  cuDeviceGetPCIBusId
          



### Number of Threads: `1024`

In [None]:
%%writefile CUDA_1_1K_24.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Explicit-copy benchmark: fills a host vector, copies it to the device,
// launches the convolution kernel RUN_CNT times, copies the result back and
// counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA call fails.
int main ()
{

  // Initialization of constant variables
  const int VECTOR_SIZE = 1 << 24; // Modify depending on the needed value
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float); // size_t avoids int overflow for larger vectors
  const int THREAD_NUM = 1024; // Modify depending on the needed value
  const int RUN_CNT = 30; // Modify depending on the needed value

  // Rounded up so that numBlocks * THREAD_NUM >= VECTOR_SIZE
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM;

  // Declaration of I/O variables
  float* h_in = NULL;
  float* h_out = NULL; // Host (i.e. CPU) variables
  float* d_in = NULL;
  float* d_out = NULL; // Device (i.e. GPU) variables

  // Print current specifications
  printf("\n%s\n%s: %d\n%s: %d\n%s: %d\n\n",
  "-- 1-D Convolution --",
  "Vector size", VECTOR_SIZE,
  "Number of threads", THREAD_NUM,
  "Number of blocks", numBlocks
  );

  // STEP 1: Allocate HOST memory
  // h_out is zero-filled so the buffer copied to the device in STEP 3 holds
  // defined values (the kernel never writes the last two output elements).
  h_in = (float*)malloc(VECTOR_BYTES);
  h_out = (float*)calloc(VECTOR_SIZE, sizeof(float));
  if (h_in == NULL || h_out == NULL) {
    fprintf(stderr, "Host allocation of %d floats failed\n", VECTOR_SIZE);
    free(h_in);
    free(h_out);
    return EXIT_FAILURE;
  }

  // Initialization of data
  for (int i = 0; i < VECTOR_SIZE; i++){
    h_in[i] = float(i);
  }

  // STEP 2: Allocate DEVICE memory
  checkCuda(cudaMalloc((void**)&d_in, VECTOR_BYTES), "cudaMalloc(d_in)");
  checkCuda(cudaMalloc((void**)&d_out, VECTOR_BYTES), "cudaMalloc(d_out)");

  // STEP 3: Transfer data from host to device memory
  checkCuda(cudaMemcpy(d_in, h_in, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
  checkCuda(cudaMemcpy(d_out, h_out, VECTOR_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

  // STEP 4: Execute (convolution) kernel
  for (int j = 0; j < RUN_CNT; j++)
    convolution<<<numBlocks, THREAD_NUM>>>(d_in, d_out, VECTOR_SIZE);
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // STEP 5: Transfer data back to host memory
  // Errors raised while the kernels ran are reported by this call.
  checkCuda(cudaMemcpy(h_out, d_out, VECTOR_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_out)");

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < VECTOR_SIZE - 2; k++)
  {
    if ( (h_in[k] + h_in[k+1] + h_in[k+2]) / 3.0f != h_out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // STEP 6: Deallocate HOST memory
  free(h_in);
  free(h_out);

  // STEP 7: Deallocate DEVICE memory
  checkCuda(cudaFree(d_in), "cudaFree(d_in)");
  checkCuda(cudaFree(d_out), "cudaFree(d_out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_1_1K_24.cu -o CUDA_1_1K_24
nvprof ./CUDA_1_1K_24

# IMPLEMENTATION #2: Unified Memory (Introduced in CUDA 6)

## Vector Size: `2^20`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory benchmark: allocates both vectors with cudaMallocManaged,
// fills the input on the host, launches the convolution kernel `count` times
// and counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 20; //AKA ARRAY_SIZE (Modify depending on the needed value)
  const size_t ARRAY_BYTES = (size_t)vector * sizeof(float); // size_t avoids int overflow for larger vectors
  const int threads = 256; //Modify depending on the needed value
  const int count = 30;

  // Declaration of I/O variables/arrays
  float* in = NULL;
  float* out = NULL;

  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // rounded up to cover every element
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
  }
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // Wait for the kernels before the host reads `out`; this call also reports
  // errors raised while they ran.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  /*
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }
  */
  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Deallocate managed memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

Overwriting CUDA_2.cu


In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

==8580== NVPROF is profiling process 8580, command: ./CUDA_2
Error count: 0
==8580== Profiling application: ./CUDA_2
==8580== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  4.5540ms        30  151.80us  60.800us  2.7832ms  convolution(float*, float*, int)
      API calls:   96.68%  157.15ms         2  78.573ms  37.068us  157.11ms  cudaMallocManaged
                    2.69%  4.3679ms         1  4.3679ms  4.3679ms  4.3679ms  cudaDeviceSynchronize
                    0.39%  628.11us         2  314.05us  275.58us  352.53us  cudaFree
                    0.15%  248.19us        30  8.2720us  4.5380us  46.760us  cudaLaunchKernel
                    0.07%  120.70us       101  1.1950us     140ns  51.514us  cuDeviceGetAttribute
                    0.02%  26.914us         1  26.914us  26.914us  26.914us  cuDeviceGetName
                    0.00%  6.2700us         1  6.2700us  6.2700us  6.2700us  cuDeviceGetPCIBusId
  



### Number of Threads: `512`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory benchmark: allocates both vectors with cudaMallocManaged,
// fills the input on the host, launches the convolution kernel `count` times
// and counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 20; //AKA ARRAY_SIZE (Modify depending on the needed value)
  const size_t ARRAY_BYTES = (size_t)vector * sizeof(float); // size_t avoids int overflow for larger vectors
  const int threads = 512; //Modify depending on the needed value
  const int count = 30;

  // Declaration of I/O variables/arrays
  float* in = NULL;
  float* out = NULL;

  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // rounded up to cover every element
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
  }
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // Wait for the kernels before the host reads `out`; this call also reports
  // errors raised while they ran.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Debug dump of every output element; kept disabled because it prints
  // vector - 2 lines on each run.
  /*
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }
  */

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Deallocate managed memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

### Number of Threads: `1024`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory benchmark: allocates both vectors with cudaMallocManaged,
// fills the input on the host, launches the convolution kernel `count` times
// and counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 20; //AKA ARRAY_SIZE (Modify depending on the needed value)
  const size_t ARRAY_BYTES = (size_t)vector * sizeof(float); // size_t avoids int overflow for larger vectors
  const int threads = 1024; //Modify depending on the needed value
  const int count = 30;

  // Declaration of I/O variables/arrays
  float* in = NULL;
  float* out = NULL;

  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // rounded up to cover every element
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
  }
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // Wait for the kernels before the host reads `out`; this call also reports
  // errors raised while they ran.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Debug dump of every output element; kept disabled because it prints
  // vector - 2 lines on each run.
  /*
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }
  */

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Deallocate managed memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

## Vector Size: `2^22`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory benchmark: allocates both vectors with cudaMallocManaged,
// fills the input on the host, launches the convolution kernel `count` times
// and counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 22; //AKA ARRAY_SIZE (Modify depending on the needed value)
  const size_t ARRAY_BYTES = (size_t)vector * sizeof(float); // size_t avoids int overflow for larger vectors
  const int threads = 256; //Modify depending on the needed value
  const int count = 30;

  // Declaration of I/O variables/arrays
  float* in = NULL;
  float* out = NULL;

  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // rounded up to cover every element
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
  }
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // Wait for the kernels before the host reads `out`; this call also reports
  // errors raised while they ran.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Debug dump of every output element; kept disabled because it prints
  // vector - 2 lines on each run.
  /*
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }
  */

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Deallocate managed memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

### Number of Threads: `512`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory benchmark: allocates both vectors with cudaMallocManaged,
// fills the input on the host, launches the convolution kernel `count` times
// and counts elements that differ from a host-computed reference.
// Returns 0 on completion; exits with EXIT_FAILURE if a CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 22; //AKA ARRAY_SIZE (Modify depending on the needed value)
  const size_t ARRAY_BYTES = (size_t)vector * sizeof(float); // size_t avoids int overflow for larger vectors
  const int threads = 512; //Modify depending on the needed value
  const int count = 30;

  // Declaration of I/O variables/arrays
  float* in = NULL;
  float* out = NULL;

  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // rounded up to cover every element
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
  }
  // Launches do not return a status; a bad launch configuration shows up here.
  checkCuda(cudaGetLastError(), "convolution launch");

  // Wait for the kernels before the host reads `out`; this call also reports
  // errors raised while they ran.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Debug dump of every output element; kept disabled because it prints
  // vector - 2 lines on each run.
  /*
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }
  */

  // Error Checking: recompute every window on the host and compare
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Deallocate managed memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

### Number of Threads: `1024`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average: out[p] = (in[p] + in[p+1] + in[p+2]) / 3 for every
// p in [0, size - 3]. The last two elements of `out` are never written.
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;
  const int limit = size - 2;  // first position without a full 3-element window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos+1] + in[pos+2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message when a CUDA runtime call reports failure.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory driver: allocates managed input/output vectors, fills the
// input on the host, launches the convolution kernel `count` times, prints the
// results and verifies them against a host-side recomputation.
// Returns 0 on completion; exits with EXIT_FAILURE if any CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 22; //AKA ARRAY_SIZE (Modify depending on the needed value)
  // size_t keeps the byte count from overflowing int if `vector` is enlarged
  const size_t ARRAY_BYTES = vector * sizeof(float);
  const int threads = 1024; //Modify depending on the needed value
  const int count = 30; // number of kernel launches

  // Declaration of I/O variables/arrays (NULL until allocation succeeds)
  float* in = NULL;
  float* out = NULL;

  // Managed allocations: dereferenced by the host loops below and by the kernel
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // ceil(vector / threads)
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "convolution launch");
  }

  // Wait for the GPU before the host reads `out`; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // NOTE(review): prints vector - 2 lines (about 4 million here); presumably
  // debugging output -- confirm it is wanted during profiling runs.
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }

  // Error Checking: recompute every average on the host and count mismatches
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Release the managed allocations
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

## Vector Size: `2^24`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average (1-D convolution) over `in`, written to `out`.
//   in   : input vector, read at indices [0, size)
//   out  : output vector; out[i] = (in[i] + in[i+1] + in[i+2]) / 3 for i in [0, size-2)
//   size : number of elements in `in`
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of `out` are never written.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;  // total threads in the grid
  const int limit = size - 2;               // first index without a full 3-wide window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos + 1] + in[pos + 2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message when a CUDA runtime call reports failure.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory driver: allocates managed input/output vectors, fills the
// input on the host, launches the convolution kernel `count` times, prints the
// results and verifies them against a host-side recomputation.
// Returns 0 on completion; exits with EXIT_FAILURE if any CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 24; //AKA ARRAY_SIZE (Modify depending on the needed value)
  // size_t keeps the byte count from overflowing int if `vector` is enlarged
  const size_t ARRAY_BYTES = vector * sizeof(float);
  const int threads = 256; //Modify depending on the needed value
  const int count = 30; // number of kernel launches

  // Declaration of I/O variables/arrays (NULL until allocation succeeds)
  float* in = NULL;
  float* out = NULL;

  // Managed allocations: dereferenced by the host loops below and by the kernel
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // ceil(vector / threads)
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "convolution launch");
  }

  // Wait for the GPU before the host reads `out`; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // NOTE(review): prints vector - 2 lines (about 16.7 million here); presumably
  // debugging output -- confirm it is wanted during profiling runs.
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }

  // Error Checking: recompute every average on the host and count mismatches
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Release the managed allocations
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

### Number of Threads: `512`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average (1-D convolution) over `in`, written to `out`.
//   in   : input vector, read at indices [0, size)
//   out  : output vector; out[i] = (in[i] + in[i+1] + in[i+2]) / 3 for i in [0, size-2)
//   size : number of elements in `in`
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of `out` are never written.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;  // total threads in the grid
  const int limit = size - 2;               // first index without a full 3-wide window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos + 1] + in[pos + 2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message when a CUDA runtime call reports failure.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory driver: allocates managed input/output vectors, fills the
// input on the host, launches the convolution kernel `count` times, prints the
// results and verifies them against a host-side recomputation.
// Returns 0 on completion; exits with EXIT_FAILURE if any CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 24; //AKA ARRAY_SIZE (Modify depending on the needed value)
  // size_t keeps the byte count from overflowing int if `vector` is enlarged
  const size_t ARRAY_BYTES = vector * sizeof(float);
  const int threads = 512; //Modify depending on the needed value
  const int count = 30; // number of kernel launches

  // Declaration of I/O variables/arrays (NULL until allocation succeeds)
  float* in = NULL;
  float* out = NULL;

  // Managed allocations: dereferenced by the host loops below and by the kernel
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // ceil(vector / threads)
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "convolution launch");
  }

  // Wait for the GPU before the host reads `out`; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // NOTE(review): prints vector - 2 lines (about 16.7 million here); presumably
  // debugging output -- confirm it is wanted during profiling runs.
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }

  // Error Checking: recompute every average on the host and count mismatches
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Release the managed allocations
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

### Number of Threads: `1024`

In [None]:
%%writefile CUDA_2.cu

#include<stdio.h>
#include<stdlib.h>

// 3-point moving average (1-D convolution) over `in`, written to `out`.
//   in   : input vector, read at indices [0, size)
//   out  : output vector; out[i] = (in[i] + in[i+1] + in[i+2]) / 3 for i in [0, size-2)
//   size : number of elements in `in`
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of `out` are never written.
__global__
void convolution(float* in, float* out, int size)
{
  const int step = blockDim.x * gridDim.x;  // total threads in the grid
  const int limit = size - 2;               // first index without a full 3-wide window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos + 1] + in[pos + 2]) / 3.0f;
    pos += step;
  }
}

// Abort with a readable message when a CUDA runtime call reports failure.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Unified-memory driver: allocates managed input/output vectors, fills the
// input on the host, launches the convolution kernel `count` times, prints the
// results and verifies them against a host-side recomputation.
// Returns 0 on completion; exits with EXIT_FAILURE if any CUDA call fails.
int main(){

  // Initialization of constant variables
  const int vector = 1 << 24; //AKA ARRAY_SIZE (Modify depending on the needed value)
  // size_t keeps the byte count from overflowing int if `vector` is enlarged
  const size_t ARRAY_BYTES = vector * sizeof(float);
  const int threads = 1024; //Modify depending on the needed value
  const int count = 30; // number of kernel launches

  // Declaration of I/O variables/arrays (NULL until allocation succeeds)
  float* in = NULL;
  float* out = NULL;

  // Managed allocations: dereferenced by the host loops below and by the kernel
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // Data Initialization
  for (int i = 0; i < vector; i++){
    in[i] = float(i);
  }

  //Execute kernel
  int numBlocks = (vector + threads - 1)/threads; // ceil(vector / threads)
  for (int j = 0; j < count; j++){
    convolution<<<numBlocks, threads>>>(in, out, vector);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "convolution launch");
  }

  // Wait for the GPU before the host reads `out`; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // NOTE(review): prints vector - 2 lines (about 16.7 million here); presumably
  // debugging output -- confirm it is wanted during profiling runs.
  for (int i = 0; i < vector - 2; i++){
    printf("%.6f \n", out[i]);
  }

  // Error Checking: recompute every average on the host and count mismatches
  int errorCount = 0;
  for(int k = 0; k < vector - 2; k++)
  {
    if ( (in[k] + in[k+1] + in[k+2]) / 3.0f != out[k])
      errorCount++;
  }

  printf("Error count: %d\n", errorCount);

  // Release the managed allocations
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_2.cu -o CUDA_2
nvprof ./CUDA_2

# IMPLEMENTATION #3: Prefetching of Data with Memory Advice

## Vector Size: `2^20`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_3a.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<20;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 256;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3a.cu -o CUDA_3a
nvprof ./CUDA_3a

### Number of Threads: `512`

In [None]:
%%writefile CUDA_3b.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<20;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 512;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3b.cu -o CUDA_3b
nvprof ./CUDA_3b

### Number of Threads: `1024`

In [None]:
%%writefile CUDA_3c.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<20;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 1024;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3c.cu -o CUDA_3c
nvprof ./CUDA_3c

## Vector Size: `2^22`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_3d.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<22;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 256;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3d.cu -o CUDA_3d
nvprof ./CUDA_3d

### Number of Threads: `512`

In [None]:
%%writefile CUDA_3e.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<22;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 512;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3e.cu -o CUDA_3e
nvprof ./CUDA_3e

### Number of Threads: `1024`

In [None]:
%%writefile CUDA_3f.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<22;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 1024;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3f.cu -o CUDA_3f
nvprof ./CUDA_3f

## Vector Size: `2^24`

### Number of Threads: `256`

In [None]:
%%writefile CUDA_3g.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<24;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 256;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3g.cu -o CUDA_3g
nvprof ./CUDA_3g

### Number of Threads: `512`

In [None]:
%%writefile CUDA_3h.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<24;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 512;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3h.cu -o CUDA_3h
nvprof ./CUDA_3h

### Number of Threads: `1024`

In [None]:
%%writefile CUDA_3i.cu

//with maximum pre-fetching 

#include <stdio.h>
#include <stdlib.h>

// 3-point moving-average (1-D convolution) kernel.
//   n     : number of elements in d_in
//   d_out : output; d_out[i] = (d_in[i] + d_in[i+1] + d_in[i+2]) / 3 for i in [0, n-2)
//   d_in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of d_out are never written.
__global__
void conv(int n, float* d_out, float* d_in){
  const int step = blockDim.x*gridDim.x;   // total threads in the grid
  const int limit = n-2;                   // first index without a full 3-wide window
  int pos = blockIdx.x*blockDim.x + threadIdx.x;
  while(pos < limit){
    d_out[pos] = (d_in[pos] + d_in[pos+1] + d_in[pos+2])/3.0f;
    pos += step;
  }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Report, but tolerate, a failed memory hint (advise/prefetch): the results
// computed below do not depend on where the managed pages currently live.
static void checkHint(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA warning (%s): %s\n", what, cudaGetErrorString(status));
    cudaGetLastError(); // reset the recorded error so later launch checks are not polluted
  }
}

// Unified-memory driver with memory advice and prefetching: allocates managed
// vectors, places them with cudaMemAdvise/cudaMemPrefetchAsync, fills the input
// on the host, launches conv `runs` times and verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){

  const unsigned int ARRAY_SIZE = 1<<24;
  // size_t keeps the byte count from overflowing 32 bits if ARRAY_SIZE is enlarged
  const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
  const int runs = 30;
  // Number of outputs to verify; guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int OUT_COUNT = ARRAY_SIZE > 2 ? ARRAY_SIZE - 2 : 0;

  // Declare the managed arrays (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  // --- Prefetch the data, part #1 --- //
  // -1 has the same value as cudaCpuDeviceId, so a failed query must not be
  // ignored: the "prefetch to GPU" calls below would silently target the host.
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  printf("Device # = %d\n", device);
  // Advise that host memory is the preferred location of 'in'
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId),
            "advise in: preferred location = host");
  // Asynchronously bring 'in' to host memory ahead of the initialisation loop (NULL = default stream)
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, cudaCpuDeviceId, NULL), "prefetch in -> host");
  // Asynchronously move 'out' to the GPU, where the kernel writes it
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "prefetch out -> device");

  // Initialize data
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    in[i] = float(i);

  // --- Prefetch the data, part #2 --- //
  // Advise that 'in' is mostly read from now on
  checkHint(cudaMemAdvise(in, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId),
            "advise in: read mostly");
  // Prefetch 'in' to the GPU before the kernel reads it
  checkHint(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "prefetch in -> device");

  // Launch the kernel `runs` times
  int numThreads = 1024;
  int numBlocks = (ARRAY_SIZE+numThreads-1) / numThreads; // ceil(ARRAY_SIZE / numThreads)
  for(int i=0; i<runs; i++){
    conv<<<numBlocks, numThreads>>>(ARRAY_SIZE,out,in);
    // A launch returns no status; a bad configuration only shows up here
    checkCuda(cudaGetLastError(), "conv launch");
  }

  printf("numThreads = %d, numBlocks = %d\n", numThreads, numBlocks);
  // Wait for the GPU to finish; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (kernels)");

  // --- Prefetch the data, part #3: bring 'out' back for the host-side check --- //
  checkHint(cudaMemPrefetchAsync(out, ARRAY_BYTES,cudaCpuDeviceId,NULL), "prefetch out -> host");
  // The prefetch is only enqueued; wait for it so the loop below does not
  // race ahead of the migration and page-fault anyway.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (prefetch)");

  // Check for errors: recompute every average on the host and count mismatches
  unsigned int err_count = 0;
  for(unsigned int i=0; i<OUT_COUNT; i++){
    if((in[i]+in[i+1]+in[i+2])/3.0f != out[i])
      err_count++;
  }
  // %u matches the unsigned counter
  printf("\n Error count (CUDA program): %u\n", err_count);


  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc CUDA_3i.cu -o CUDA_3i
nvprof ./CUDA_3i

# IMPLEMENTATION #4: Data Transfer or Initialization as a CUDA Kernel

## Vector Size: `2^20`

### Number of Threads: `256`

In [None]:
%%writefile cuda_4_2r20_256.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^20
// Number of Threads: 256

#include <stdio.h>
#include <stdlib.h>


// Device-side initialisation of both vectors.
//   n   : number of elements to initialise
//   out : every element in [0, n) is set to 0
//   in  : element i in [0, n) is set to float(i)
// Grid-stride loop: any launch configuration covers every index.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;  // total threads in the grid
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < n) {
    in[pos] = float(pos);
    out[pos] = float(0);
    pos += step;
  }
}

// 1-dimensional convolution: 3-point moving average.
//   n   : number of elements in `in`
//   out : out[i] = (in[i] + in[i+1] + in[i+2]) / 3 for i in [0, n-2)
//   in  : input, read at indices [0, n)
// Grid-stride loop: any launch configuration covers every output index.
// The last two elements of `out` are never written.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;  // total threads in the grid
  const int limit = n - 2;                  // first index without a full 3-wide window
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while (pos < limit) {
    out[pos] = (in[pos] + in[pos + 1] + in[pos + 2]) / 3.0f;
    pos += step;
  }
}


// Abort with a readable message when a CUDA runtime call reports failure.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Driver for the "initialization as a CUDA kernel" variant: allocates managed
// vectors, fills them on the GPU with init, runs conv_1d RUNS times, then
// verifies the result on the host.
// Returns 0 on completion; exits with EXIT_FAILURE if any CUDA call fails.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 20;
  const unsigned int THREAD_NUM = 256;
  // size_t keeps the byte count from overflowing 32 bits if VECTOR_SIZE is enlarged
  const size_t VECTOR_BYTES = VECTOR_SIZE * sizeof(float);
  int numBlocks = (VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM; // ceil(VECTOR_SIZE / THREAD_NUM)

  // THREAD_NUM is unsigned, hence %u for it as well
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate managed memory (NULL until allocation succeeds)
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>(VECTOR_SIZE, out, in);
  // A launch returns no status; a bad configuration only shows up here
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>(VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; also surfaces kernel faults
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every average on the host and count mismatches
  int errorCount = 0;
  // Guarded so a size below 2 cannot wrap the unsigned subtraction
  const unsigned int max = VECTOR_SIZE > 2 ? VECTOR_SIZE - 2 : 0;
  for(unsigned int i = 0; i < max; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r20_256.cu -o cuda_4_2r20_256

In [None]:
%%shell
nvprof ./cuda_4_2r20_256

### Number of Threads: `512`

In [None]:
%%writefile cuda_4_2r20_512.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^20
// Number of Threads: 512

#include <stdio.h>
#include <stdlib.h>


// Device-side initialization of the input and output vectors.
//   n   : number of elements to initialize in each array
//   out : result buffer; elements [0, n) are cleared to 0
//   in  : input buffer; element k receives the value float(k)
// Written as a grid-stride loop, so any 1-D launch configuration covers all
// n elements: each thread starts at its global index and advances by the
// total number of launched threads.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < n) {
    in[pos] = float(pos);
    out[pos] = 0.0f;
    pos += step;
  }
}

// 1-D convolution implemented as a 3-point moving average.
//   n   : number of elements in `in`
//   out : result buffer; out[k] = (in[k] + in[k+1] + in[k+2]) / 3 for every
//         k in [0, n-2). out[n-2] and out[n-1] are never written.
//   in  : input buffer; only read by this kernel
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any 1-D launch shape is valid.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  const int limit = n - 2;  // the last full 3-element window starts at n-3
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < limit) {
    const float windowSum = in[pos] + in[pos+1] + in[pos+2];
    out[pos] = windowSum / 3.0f;
    pos += step;
  }
}


// Abort with a readable message if a CUDA runtime call or kernel launch failed.
// `what` names the step that produced `status` so the failure is easy to locate.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates two unified-memory vectors, fills them with the
// init kernel, launches conv_1d RUNS times, then recomputes the moving average
// on the host and prints how many outputs disagree with the GPU result.
// Exits with EXIT_FAILURE if any CUDA call or kernel launch reports an error.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 20;
  const unsigned int THREAD_NUM = 512;
  // Byte count kept in size_t so it is not narrowed back to 32 bits
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float);
  // Ceiling division: enough blocks for one thread per element
  const int numBlocks = (int)((VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM);

  // THREAD_NUM is unsigned, so it is printed with %u (printed text is unchanged)
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate unified (managed) memory, accessible from both host and device
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
  // A kernel launch returns nothing; launch errors surface via cudaGetLastError()
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernels were executing are reported by this call.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every 3-point average on the host. Only the
  // first VECTOR_SIZE - 2 outputs are compared because conv_1d writes no others.
  // NOTE(review): exact float equality is kept from the original check; it
  // assumes host and device round the adds/divide identically - confirm if the
  // build ever enables fast-math flags.
  int errorCount = 0;
  const unsigned int checked = VECTOR_SIZE - 2;
  for(unsigned int i = 0; i < checked; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r20_512.cu -o cuda_4_2r20_512

In [None]:
%%shell
nvprof ./cuda_4_2r20_512

### Number of Threads: `1024`

In [None]:
%%writefile cuda_4_2r20_1024.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^20
// Number of Threads: 1024

#include <stdio.h>
#include <stdlib.h>


// Device-side initialization of the input and output vectors.
//   n   : number of elements to initialize in each array
//   out : result buffer; elements [0, n) are cleared to 0
//   in  : input buffer; element k receives the value float(k)
// Written as a grid-stride loop, so any 1-D launch configuration covers all
// n elements: each thread starts at its global index and advances by the
// total number of launched threads.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < n) {
    in[pos] = float(pos);
    out[pos] = 0.0f;
    pos += step;
  }
}

// 1-D convolution implemented as a 3-point moving average.
//   n   : number of elements in `in`
//   out : result buffer; out[k] = (in[k] + in[k+1] + in[k+2]) / 3 for every
//         k in [0, n-2). out[n-2] and out[n-1] are never written.
//   in  : input buffer; only read by this kernel
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any 1-D launch shape is valid.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  const int limit = n - 2;  // the last full 3-element window starts at n-3
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < limit) {
    const float windowSum = in[pos] + in[pos+1] + in[pos+2];
    out[pos] = windowSum / 3.0f;
    pos += step;
  }
}


// Abort with a readable message if a CUDA runtime call or kernel launch failed.
// `what` names the step that produced `status` so the failure is easy to locate.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates two unified-memory vectors, fills them with the
// init kernel, launches conv_1d RUNS times, then recomputes the moving average
// on the host and prints how many outputs disagree with the GPU result.
// Exits with EXIT_FAILURE if any CUDA call or kernel launch reports an error.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 20;
  const unsigned int THREAD_NUM = 1024;
  // Byte count kept in size_t so it is not narrowed back to 32 bits
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float);
  // Ceiling division: enough blocks for one thread per element
  const int numBlocks = (int)((VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM);

  // THREAD_NUM is unsigned, so it is printed with %u (printed text is unchanged)
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate unified (managed) memory, accessible from both host and device
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
  // A kernel launch returns nothing; launch errors surface via cudaGetLastError()
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernels were executing are reported by this call.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every 3-point average on the host. Only the
  // first VECTOR_SIZE - 2 outputs are compared because conv_1d writes no others.
  // NOTE(review): exact float equality is kept from the original check; it
  // assumes host and device round the adds/divide identically - confirm if the
  // build ever enables fast-math flags.
  int errorCount = 0;
  const unsigned int checked = VECTOR_SIZE - 2;
  for(unsigned int i = 0; i < checked; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r20_1024.cu -o cuda_4_2r20_1024

In [None]:
%%shell
nvprof ./cuda_4_2r20_1024

## Vector Size: `2^22`

### Number of Threads: `256`

In [None]:
%%writefile cuda_4_2r22_256.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^22
// Number of Threads: 256

#include <stdio.h>
#include <stdlib.h>


// Device-side initialization of the input and output vectors.
//   n   : number of elements to initialize in each array
//   out : result buffer; elements [0, n) are cleared to 0
//   in  : input buffer; element k receives the value float(k)
// Written as a grid-stride loop, so any 1-D launch configuration covers all
// n elements: each thread starts at its global index and advances by the
// total number of launched threads.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < n) {
    in[pos] = float(pos);
    out[pos] = 0.0f;
    pos += step;
  }
}

// 1-D convolution implemented as a 3-point moving average.
//   n   : number of elements in `in`
//   out : result buffer; out[k] = (in[k] + in[k+1] + in[k+2]) / 3 for every
//         k in [0, n-2). out[n-2] and out[n-1] are never written.
//   in  : input buffer; only read by this kernel
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any 1-D launch shape is valid.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  const int limit = n - 2;  // the last full 3-element window starts at n-3
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < limit) {
    const float windowSum = in[pos] + in[pos+1] + in[pos+2];
    out[pos] = windowSum / 3.0f;
    pos += step;
  }
}


// Abort with a readable message if a CUDA runtime call or kernel launch failed.
// `what` names the step that produced `status` so the failure is easy to locate.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates two unified-memory vectors, fills them with the
// init kernel, launches conv_1d RUNS times, then recomputes the moving average
// on the host and prints how many outputs disagree with the GPU result.
// Exits with EXIT_FAILURE if any CUDA call or kernel launch reports an error.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 22;
  const unsigned int THREAD_NUM = 256;
  // Byte count kept in size_t so it is not narrowed back to 32 bits
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float);
  // Ceiling division: enough blocks for one thread per element
  const int numBlocks = (int)((VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM);

  // THREAD_NUM is unsigned, so it is printed with %u (printed text is unchanged)
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate unified (managed) memory, accessible from both host and device
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
  // A kernel launch returns nothing; launch errors surface via cudaGetLastError()
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernels were executing are reported by this call.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every 3-point average on the host. Only the
  // first VECTOR_SIZE - 2 outputs are compared because conv_1d writes no others.
  // NOTE(review): exact float equality is kept from the original check; it
  // assumes host and device round the adds/divide identically - confirm if the
  // build ever enables fast-math flags.
  int errorCount = 0;
  const unsigned int checked = VECTOR_SIZE - 2;
  for(unsigned int i = 0; i < checked; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r22_256.cu -o cuda_4_2r22_256

In [None]:
%%shell
nvprof ./cuda_4_2r22_256

### Number of Threads: `512`

In [None]:
%%writefile cuda_4_2r22_512.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^22
// Number of Threads: 512

#include <stdio.h>
#include <stdlib.h>


// Device-side initialization of the input and output vectors.
//   n   : number of elements to initialize in each array
//   out : result buffer; elements [0, n) are cleared to 0
//   in  : input buffer; element k receives the value float(k)
// Written as a grid-stride loop, so any 1-D launch configuration covers all
// n elements: each thread starts at its global index and advances by the
// total number of launched threads.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < n) {
    in[pos] = float(pos);
    out[pos] = 0.0f;
    pos += step;
  }
}

// 1-D convolution implemented as a 3-point moving average.
//   n   : number of elements in `in`
//   out : result buffer; out[k] = (in[k] + in[k+1] + in[k+2]) / 3 for every
//         k in [0, n-2). out[n-2] and out[n-1] are never written.
//   in  : input buffer; only read by this kernel
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any 1-D launch shape is valid.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  const int limit = n - 2;  // the last full 3-element window starts at n-3
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < limit) {
    const float windowSum = in[pos] + in[pos+1] + in[pos+2];
    out[pos] = windowSum / 3.0f;
    pos += step;
  }
}


// Abort with a readable message if a CUDA runtime call or kernel launch failed.
// `what` names the step that produced `status` so the failure is easy to locate.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates two unified-memory vectors, fills them with the
// init kernel, launches conv_1d RUNS times, then recomputes the moving average
// on the host and prints how many outputs disagree with the GPU result.
// Exits with EXIT_FAILURE if any CUDA call or kernel launch reports an error.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 22;
  const unsigned int THREAD_NUM = 512;
  // Byte count kept in size_t so it is not narrowed back to 32 bits
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float);
  // Ceiling division: enough blocks for one thread per element
  const int numBlocks = (int)((VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM);

  // THREAD_NUM is unsigned, so it is printed with %u (printed text is unchanged)
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate unified (managed) memory, accessible from both host and device
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
  // A kernel launch returns nothing; launch errors surface via cudaGetLastError()
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernels were executing are reported by this call.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every 3-point average on the host. Only the
  // first VECTOR_SIZE - 2 outputs are compared because conv_1d writes no others.
  // NOTE(review): exact float equality is kept from the original check; it
  // assumes host and device round the adds/divide identically - confirm if the
  // build ever enables fast-math flags.
  int errorCount = 0;
  const unsigned int checked = VECTOR_SIZE - 2;
  for(unsigned int i = 0; i < checked; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r22_512.cu -o cuda_4_2r22_512

In [None]:
%%shell
nvprof ./cuda_4_2r22_512

### Number of Threads: `1024`

In [None]:
%%writefile cuda_4_2r22_1024.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^22
// Number of Threads: 1024

#include <stdio.h>
#include <stdlib.h>


// Device-side initialization of the input and output vectors.
//   n   : number of elements to initialize in each array
//   out : result buffer; elements [0, n) are cleared to 0
//   in  : input buffer; element k receives the value float(k)
// Written as a grid-stride loop, so any 1-D launch configuration covers all
// n elements: each thread starts at its global index and advances by the
// total number of launched threads.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < n) {
    in[pos] = float(pos);
    out[pos] = 0.0f;
    pos += step;
  }
}

// 1-D convolution implemented as a 3-point moving average.
//   n   : number of elements in `in`
//   out : result buffer; out[k] = (in[k] + in[k+1] + in[k+2]) / 3 for every
//         k in [0, n-2). out[n-2] and out[n-1] are never written.
//   in  : input buffer; only read by this kernel
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any 1-D launch shape is valid.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  const int limit = n - 2;  // the last full 3-element window starts at n-3
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < limit) {
    const float windowSum = in[pos] + in[pos+1] + in[pos+2];
    out[pos] = windowSum / 3.0f;
    pos += step;
  }
}


// Abort with a readable message if a CUDA runtime call or kernel launch failed.
// `what` names the step that produced `status` so the failure is easy to locate.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates two unified-memory vectors, fills them with the
// init kernel, launches conv_1d RUNS times, then recomputes the moving average
// on the host and prints how many outputs disagree with the GPU result.
// Exits with EXIT_FAILURE if any CUDA call or kernel launch reports an error.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 22;
  const unsigned int THREAD_NUM = 1024;
  // Byte count kept in size_t so it is not narrowed back to 32 bits
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float);
  // Ceiling division: enough blocks for one thread per element
  const int numBlocks = (int)((VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM);

  // THREAD_NUM is unsigned, so it is printed with %u (printed text is unchanged)
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate unified (managed) memory, accessible from both host and device
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
  // A kernel launch returns nothing; launch errors surface via cudaGetLastError()
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernels were executing are reported by this call.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every 3-point average on the host. Only the
  // first VECTOR_SIZE - 2 outputs are compared because conv_1d writes no others.
  // NOTE(review): exact float equality is kept from the original check; it
  // assumes host and device round the adds/divide identically - confirm if the
  // build ever enables fast-math flags.
  int errorCount = 0;
  const unsigned int checked = VECTOR_SIZE - 2;
  for(unsigned int i = 0; i < checked; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r22_1024.cu -o cuda_4_2r22_1024

In [None]:
%%shell
nvprof ./cuda_4_2r22_1024

## Vector Size: `2^24`

### Number of Threads: `256`

In [None]:
%%writefile cuda_4_2r24_256.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^24
// Number of Threads: 256

#include <stdio.h>
#include <stdlib.h>


// Device-side initialization of the input and output vectors.
//   n   : number of elements to initialize in each array
//   out : result buffer; elements [0, n) are cleared to 0
//   in  : input buffer; element k receives the value float(k)
// Written as a grid-stride loop, so any 1-D launch configuration covers all
// n elements: each thread starts at its global index and advances by the
// total number of launched threads.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < n) {
    in[pos] = float(pos);
    out[pos] = 0.0f;
    pos += step;
  }
}

// 1-D convolution implemented as a 3-point moving average.
//   n   : number of elements in `in`
//   out : result buffer; out[k] = (in[k] + in[k+1] + in[k+2]) / 3 for every
//         k in [0, n-2). out[n-2] and out[n-1] are never written.
//   in  : input buffer; only read by this kernel
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any 1-D launch shape is valid.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  const int limit = n - 2;  // the last full 3-element window starts at n-3
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < limit) {
    const float windowSum = in[pos] + in[pos+1] + in[pos+2];
    out[pos] = windowSum / 3.0f;
    pos += step;
  }
}


// Abort with a readable message if a CUDA runtime call or kernel launch failed.
// `what` names the step that produced `status` so the failure is easy to locate.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates two unified-memory vectors, fills them with the
// init kernel, launches conv_1d RUNS times, then recomputes the moving average
// on the host and prints how many outputs disagree with the GPU result.
// Exits with EXIT_FAILURE if any CUDA call or kernel launch reports an error.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 24;
  const unsigned int THREAD_NUM = 256;
  // Byte count kept in size_t so it is not narrowed back to 32 bits
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float);
  // Ceiling division: enough blocks for one thread per element
  const int numBlocks = (int)((VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM);

  // THREAD_NUM is unsigned, so it is printed with %u (printed text is unchanged)
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate unified (managed) memory, accessible from both host and device
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
  // A kernel launch returns nothing; launch errors surface via cudaGetLastError()
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernels were executing are reported by this call.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every 3-point average on the host. Only the
  // first VECTOR_SIZE - 2 outputs are compared because conv_1d writes no others.
  // NOTE(review): exact float equality is kept from the original check; it
  // assumes host and device round the adds/divide identically - confirm if the
  // build ever enables fast-math flags.
  int errorCount = 0;
  const unsigned int checked = VECTOR_SIZE - 2;
  for(unsigned int i = 0; i < checked; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r24_256.cu -o cuda_4_2r24_256

In [None]:
%%shell
nvprof ./cuda_4_2r24_256

### Number of Threads: `512`

In [None]:
%%writefile cuda_4_2r24_512.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^24
// Number of Threads: 512

#include <stdio.h>
#include <stdlib.h>


// Device-side initialization of the input and output vectors.
//   n   : number of elements to initialize in each array
//   out : result buffer; elements [0, n) are cleared to 0
//   in  : input buffer; element k receives the value float(k)
// Written as a grid-stride loop, so any 1-D launch configuration covers all
// n elements: each thread starts at its global index and advances by the
// total number of launched threads.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < n) {
    in[pos] = float(pos);
    out[pos] = 0.0f;
    pos += step;
  }
}

// 1-D convolution implemented as a 3-point moving average.
//   n   : number of elements in `in`
//   out : result buffer; out[k] = (in[k] + in[k+1] + in[k+2]) / 3 for every
//         k in [0, n-2). out[n-2] and out[n-1] are never written.
//   in  : input buffer; only read by this kernel
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any 1-D launch shape is valid.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  const int limit = n - 2;  // the last full 3-element window starts at n-3
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < limit) {
    const float windowSum = in[pos] + in[pos+1] + in[pos+2];
    out[pos] = windowSum / 3.0f;
    pos += step;
  }
}


// Abort with a readable message if a CUDA runtime call or kernel launch failed.
// `what` names the step that produced `status` so the failure is easy to locate.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates two unified-memory vectors, fills them with the
// init kernel, launches conv_1d RUNS times, then recomputes the moving average
// on the host and prints how many outputs disagree with the GPU result.
// Exits with EXIT_FAILURE if any CUDA call or kernel launch reports an error.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 24;
  const unsigned int THREAD_NUM = 512;
  // Byte count kept in size_t so it is not narrowed back to 32 bits
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float);
  // Ceiling division: enough blocks for one thread per element
  const int numBlocks = (int)((VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM);

  // THREAD_NUM is unsigned, so it is printed with %u (printed text is unchanged)
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate unified (managed) memory, accessible from both host and device
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
  // A kernel launch returns nothing; launch errors surface via cudaGetLastError()
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernels were executing are reported by this call.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every 3-point average on the host. Only the
  // first VECTOR_SIZE - 2 outputs are compared because conv_1d writes no others.
  // NOTE(review): exact float equality is kept from the original check; it
  // assumes host and device round the adds/divide identically - confirm if the
  // build ever enables fast-math flags.
  int errorCount = 0;
  const unsigned int checked = VECTOR_SIZE - 2;
  for(unsigned int i = 0; i < checked; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r24_512.cu -o cuda_4_2r24_512

In [None]:
%%shell
nvprof ./cuda_4_2r24_512

### Number of Threads: `1024`

In [None]:
%%writefile cuda_4_2r24_1024.cu
// Data Transfer method: Initialization as a CUDA Kernel
// Vector size: 2^24
// Number of Threads: 1024

#include <stdio.h>
#include <stdlib.h>


// Device-side initialization of the input and output vectors.
//   n   : number of elements to initialize in each array
//   out : result buffer; elements [0, n) are cleared to 0
//   in  : input buffer; element k receives the value float(k)
// Written as a grid-stride loop, so any 1-D launch configuration covers all
// n elements: each thread starts at its global index and advances by the
// total number of launched threads.
__global__
void init(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < n) {
    in[pos] = float(pos);
    out[pos] = 0.0f;
    pos += step;
  }
}

// 1-D convolution implemented as a 3-point moving average.
//   n   : number of elements in `in`
//   out : result buffer; out[k] = (in[k] + in[k+1] + in[k+2]) / 3 for every
//         k in [0, n-2). out[n-2] and out[n-1] are never written.
//   in  : input buffer; only read by this kernel
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so any 1-D launch shape is valid.
__global__
void conv_1d(int n, float *out, float *in) {
  const int step = blockDim.x * gridDim.x;
  const int limit = n - 2;  // the last full 3-element window starts at n-3
  int pos = blockIdx.x * blockDim.x + threadIdx.x;

  while(pos < limit) {
    const float windowSum = in[pos] + in[pos+1] + in[pos+2];
    out[pos] = windowSum / 3.0f;
    pos += step;
  }
}


// Abort with a readable message if a CUDA runtime call or kernel launch failed.
// `what` names the step that produced `status` so the failure is easy to locate.
static void checkCuda(cudaError_t status, const char *what) {
  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates two unified-memory vectors, fills them with the
// init kernel, launches conv_1d RUNS times, then recomputes the moving average
// on the host and prints how many outputs disagree with the GPU result.
// Exits with EXIT_FAILURE if any CUDA call or kernel launch reports an error.
int main() {
  // Initialize specs for program
  const unsigned int RUNS = 30;
  const unsigned int VECTOR_SIZE = 1 << 24;
  const unsigned int THREAD_NUM = 1024;
  // Byte count kept in size_t so it is not narrowed back to 32 bits
  const size_t VECTOR_BYTES = (size_t)VECTOR_SIZE * sizeof(float);
  // Ceiling division: enough blocks for one thread per element
  const int numBlocks = (int)((VECTOR_SIZE + THREAD_NUM - 1) / THREAD_NUM);

  // THREAD_NUM is unsigned, so it is printed with %u (printed text is unchanged)
  printf("\n%s\n\n%s\n%s: %u\n%s: %u\n%s: %d\n\n",
    "#=== GPU-CPU Memory Transfer ===#",
    "-- 1-D Convolution --",
    "Vector size", VECTOR_SIZE,
    "Number of threads", THREAD_NUM,
    "Number of blocks", numBlocks
  );

  // Allocate unified (managed) memory, accessible from both host and device
  float *in = NULL, *out = NULL;
  checkCuda(cudaMallocManaged(&in, VECTOR_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, VECTOR_BYTES), "cudaMallocManaged(out)");

  // Initialize data in CUDA kernel
  init<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
  // A kernel launch returns nothing; launch errors surface via cudaGetLastError()
  checkCuda(cudaGetLastError(), "init launch");

  // Run kernel for 1-D convolution
  for(unsigned int run = 0; run < RUNS; run++) {
    conv_1d<<<numBlocks, THREAD_NUM>>>((int)VECTOR_SIZE, out, in);
    checkCuda(cudaGetLastError(), "conv_1d launch");
  }

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernels were executing are reported by this call.
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Check for errors: recompute every 3-point average on the host. Only the
  // first VECTOR_SIZE - 2 outputs are compared because conv_1d writes no others.
  // NOTE(review): exact float equality is kept from the original check; it
  // assumes host and device round the adds/divide identically - confirm if the
  // build ever enables fast-math flags.
  int errorCount = 0;
  const unsigned int checked = VECTOR_SIZE - 2;
  for(unsigned int i = 0; i < checked; i++) {
    if((in[i] + in[i+1] + in[i+2]) / 3.0f != out[i])
      errorCount++;
  }
  printf("Error count: %d\n", errorCount);

  // Free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  return 0;
}

In [None]:
%%shell
nvcc cuda_4_2r24_1024.cu -o cuda_4_2r24_1024

In [None]:
%%shell
nvprof ./cuda_4_2r24_1024