In [52]:
%%writefile step1.cu

#include <stdio.h>
#include <cuda_runtime.h>

/**
 * Doubles every element of an array in place.
 *
 * Each thread handles at most one element, chosen by its flat 1D global index
 * (blockIdx.x * blockDim.x + threadIdx.x). Only the x components of the launch
 * configuration are consulted.
 *
 * @param d_arr  array whose elements are read and overwritten by the kernel
 * @param size   number of elements in d_arr
 */
__global__ void MulByTwo(int* d_arr, int const size)
{
  int const gid = threadIdx.x + blockDim.x * blockIdx.x;

  // Threads whose index falls past the end of the array have nothing to do.
  if (gid >= size)
    return;

  d_arr[gid] *= 2;
}

// Fills h_arr[0 .. size-1] with the ascending sequence 0, 1, 2, ...
// Nothing is written when size <= 0.
void InitHostArray(int* h_arr, int const size)
{
  int value = 0;

  while (value < size)
  {
    h_arr[value] = value;
    ++value;
  }
}

// Compares the GPU output against a CPU reference and reports the verdict on
// stdout.
//
// Element k passes when h_result[k] equals h_arr[k] * 2. The scan stops at the
// first failing element, printing its input value, the GPU value and the
// index. If every element passes, "Arrays match." is printed instead.
//
//   h_arr     original input values
//   h_result  values copied back from the device
//   size      number of elements to compare in both arrays
void checkResult(int* h_arr, int* h_result, int const size)
{
  for (int idx = 0; idx < size; ++idx)
  {
    int const expected = h_arr[idx] * 2;
    if (expected == h_result[idx])
      continue;

    printf("Arrays do not match!\n");
    printf("host %d gpu %d at current %d\n", h_arr[idx], h_result[idx], idx);
    return;
  }

  printf("Arrays match.\n\n");
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step in the message.
static void checkCuda(cudaError_t const status, char const* const what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/**
 * Times a single MulByTwo launch over a 1,000,000,000-element int array.
 *
 * Steps: allocate and initialise the host input, copy it to the device,
 * launch the kernel between two CUDA events, print the elapsed time, copy the
 * result back and validate it with checkResult.
 *
 * Every host allocation and CUDA runtime call is checked; on failure a
 * message is written to stderr and the process exits with EXIT_FAILURE.
 *
 * @return 0 when all steps completed (the comparison verdict itself is only
 *         printed by checkResult, it does not affect the exit code).
 */
int main()
{
  int const nElem = 1000000000;
  // sizeof yields size_t, so the byte count is computed in size_t, not int.
  size_t const nBytes = sizeof(int) * nElem;

  int* h_arr = (int*)malloc(nBytes);
  int* h_result = (int*)malloc(nBytes);
  if (h_arr == NULL || h_result == NULL)
  {
    fprintf(stderr, "Failed to allocate %zu bytes of host memory per buffer\n", nBytes);
    free(h_arr);
    free(h_result);
    return EXIT_FAILURE;
  }
  InitHostArray(h_arr, nElem);

  int* d_arr = NULL;
  checkCuda(cudaMalloc(&d_arr, nBytes), "cudaMalloc");
  checkCuda(cudaMemcpy(d_arr, h_arr, nBytes, cudaMemcpyHostToDevice),
            "host-to-device copy");

  int const iLen = 128;
  dim3 block(iLen);
  printf("Block size is %d\n", iLen);
  // Ceiling division: enough blocks to give every element its own thread.
  dim3 grid((nElem + block.x - 1) / block.x);
  // grid.x is unsigned, hence %u.
  printf("Grid size is %u\n", grid.x);

  cudaEvent_t start, stop;
  checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
  checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  MulByTwo<<<grid, block>>>(d_arr, nElem);
  // A launch returns no status itself; configuration errors are fetched here.
  checkCuda(cudaGetLastError(), "MulByTwo launch");
  checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");

  // Blocks the host until the stop event (recorded after the kernel) completes.
  checkCuda(cudaEventSynchronize(stop), "MulByTwo execution");

  float ms = 0.0f;
  checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
  printf("Elapsed time %f ms\n", ms);

  checkCuda(cudaMemcpy(h_result, d_arr, nBytes, cudaMemcpyDeviceToHost),
            "device-to-host copy");

  checkResult(h_arr, h_result, nElem);

  checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
  checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

  checkCuda(cudaFree(d_arr), "cudaFree");
  free(h_arr);
  free(h_result);

  return 0;
}


Overwriting step1.cu


In [53]:
!nvcc -arch=sm_75 step1.cu -o step1

In [54]:
!./step1

Block size is 128
Grid size is 7812500
Elapsed time 33.814655 ms
Arrays match.



In [55]:
%%writefile step2.cu

#include <stdio.h>
#include <cuda_runtime.h>

/**
 * Warm-up kernel: rewrites each addressed element with itself multiplied by
 * one, so the array contents are left unchanged.
 *
 * Indexing uses the flat 1D global thread index; a thread whose index is
 * >= size returns without touching memory.
 *
 * @param d_arr  array that is read and written back in place
 * @param size   number of elements in d_arr
 */
__global__ void warmUp(int* d_arr, int const size)
{
  int const gid = threadIdx.x + blockDim.x * blockIdx.x;

  if (gid >= size)
    return;

  int const current = d_arr[gid];
  d_arr[gid] = current * 1;
}

/**
 * Scales each element in place: elements at even global indices are
 * multiplied by 2, elements at odd global indices by 3.
 *
 * The branch depends on the parity of the thread's own index, so neighbouring
 * threads take different paths.
 *
 * The multiplication is carried out on the unsigned representation of the
 * value and converted back to int. A product larger than INT_MAX therefore
 * wraps modulo 2^32 instead of being a signed overflow, which is undefined
 * behaviour in C++.
 *
 * @param d_arr  array whose elements are read and overwritten by the kernel
 * @param size   number of elements in d_arr; threads with index >= size idle
 */
__global__ void MulByTwo(int* d_arr, int const size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < size)
  {
    // The if/else is kept as two explicit branches on purpose: it is the
    // construct under measurement.
    if (i % 2 == 0)
      d_arr[i] = (int)((unsigned int)d_arr[i] * 2u);
    else
      d_arr[i] = (int)((unsigned int)d_arr[i] * 3u);
  }
}

// Writes the values 0, 1, ..., size-1 into consecutive slots of h_arr.
// A size of zero or less leaves the buffer untouched.
void InitHostArray(int* h_arr, int const size)
{
  int next = 0;

  for (int* slot = h_arr; next < size; ++slot, ++next)
    *slot = next;
}

// Compares the GPU output against a CPU reference and reports the verdict on
// stdout.
//
// The expected value of element i is h_arr[i] * 2 for even i and
// h_arr[i] * 3 for odd i. The product is formed in unsigned arithmetic and
// converted back to int, so values whose product exceeds INT_MAX wrap modulo
// 2^32 rather than being a signed overflow (undefined behaviour).
//
// The scan stops at the first mismatch and prints the input value, the GPU
// value and the index. "Arrays match." is printed when no mismatch is found.
//
//   h_arr     original input values
//   h_result  values copied back from the device
//   size      number of elements to compare in both arrays
void checkResult(int* h_arr, int* h_result, int const size)
{
  for (int i = 0; i < size; i++)
  {
    unsigned int const factor = (i % 2 == 0) ? 2u : 3u;
    int const expected = (int)((unsigned int)h_arr[i] * factor);

    if (expected != h_result[i])
    {
      printf("Arrays do not match!\n");
      printf("host %d gpu %d at current %d\n", h_arr[i], h_result[i], i);
      return;
    }
  }

  printf("Arrays match.\n\n");
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step in the message.
static void checkCuda(cudaError_t const status, char const* const what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/**
 * Times a single MulByTwo launch (even/odd branching variant) over a
 * 1,000,000,000-element int array, after a one-thread warm-up launch.
 *
 * Steps: allocate and initialise the host input, copy it to the device, run
 * warmUp and wait for it, launch MulByTwo between two CUDA events, print the
 * elapsed time, copy the result back and validate it with checkResult.
 *
 * Every host allocation and CUDA runtime call is checked; on failure a
 * message is written to stderr and the process exits with EXIT_FAILURE.
 *
 * @return 0 when all steps completed (the comparison verdict itself is only
 *         printed by checkResult, it does not affect the exit code).
 */
int main()
{
  int const nElem = 1000000000;
  // sizeof yields size_t, so the byte count is computed in size_t, not int.
  size_t const nBytes = sizeof(int) * nElem;

  int* h_arr = (int*)malloc(nBytes);
  int* h_result = (int*)malloc(nBytes);
  if (h_arr == NULL || h_result == NULL)
  {
    fprintf(stderr, "Failed to allocate %zu bytes of host memory per buffer\n", nBytes);
    free(h_arr);
    free(h_result);
    return EXIT_FAILURE;
  }
  InitHostArray(h_arr, nElem);

  int* d_arr = NULL;
  checkCuda(cudaMalloc(&d_arr, nBytes), "cudaMalloc");
  checkCuda(cudaMemcpy(d_arr, h_arr, nBytes, cudaMemcpyHostToDevice),
            "host-to-device copy");

  int const iLen = 128;
  dim3 block(iLen);
  printf("Block size is %d\n", iLen);
  // Ceiling division: enough blocks to give every element its own thread.
  dim3 grid((nElem + block.x - 1) / block.x);
  // grid.x is unsigned, hence %u.
  printf("Grid size is %u\n", grid.x);

  cudaEvent_t start, stop;
  checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
  checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

  // Single-thread warm-up launch (touches element 0 only), issued and awaited
  // before the timed region so none of its work lands between the two events.
  warmUp<<<1, 1>>>(d_arr, nElem);
  checkCuda(cudaGetLastError(), "warmUp launch");
  checkCuda(cudaDeviceSynchronize(), "warmUp execution");

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  MulByTwo<<<grid, block>>>(d_arr, nElem);
  // A launch returns no status itself; configuration errors are fetched here.
  checkCuda(cudaGetLastError(), "MulByTwo launch");
  checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");

  // Blocks the host until the stop event (recorded after the kernel) completes.
  checkCuda(cudaEventSynchronize(stop), "MulByTwo execution");

  float ms = 0.0f;
  checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
  printf("Elapsed time %.5f ms\n", ms);

  checkCuda(cudaMemcpy(h_result, d_arr, nBytes, cudaMemcpyDeviceToHost),
            "device-to-host copy");

  checkResult(h_arr, h_result, nElem);

  checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
  checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

  checkCuda(cudaFree(d_arr), "cudaFree");
  free(h_arr);
  free(h_result);

  return 0;
}


Overwriting step2.cu


In [56]:
!nvcc -arch=sm_75 step2.cu -O0 -o step2

In [57]:
!./step2

Block size is 128
Grid size is 7812500
Elapsed time 34.76822 ms
Arrays match.



In [79]:
%%writefile step3.cu

#include <stdio.h>
#include <cuda_runtime.h>

/**
 * Warm-up kernel: multiplies each addressed element by one in place, which
 * leaves the stored values as they were.
 *
 * One element per thread, selected by the flat 1D global thread index.
 *
 * @param d_arr  array that is read and written back in place
 * @param size   number of elements in d_arr; threads with index >= size idle
 */
__global__ void warmUp(int* d_arr, int const size)
{
  int const tid = threadIdx.x + blockIdx.x * blockDim.x;

  // Out-of-range threads leave immediately.
  if (!(tid < size))
    return;

  d_arr[tid] *= 1;
}

/**
 * Scales each element in place, choosing the factor per run of warpSize
 * consecutive global indices: runs 0, 2, 4, ... are multiplied by 2 and
 * runs 1, 3, 5, ... by 3.
 *
 * With a 1D block whose size is a multiple of warpSize, every thread of a
 * warp evaluates the branch condition to the same value.
 *
 * The multiplication is carried out on the unsigned representation of the
 * value and converted back to int. A product larger than INT_MAX therefore
 * wraps modulo 2^32 instead of being a signed overflow, which is undefined
 * behaviour in C++.
 *
 * @param d_arr  array whose elements are read and overwritten by the kernel
 * @param size   number of elements in d_arr; threads with index >= size idle
 */
__global__ void MulByTwo(int* d_arr, int const size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < size)
  {
    // The if/else is kept as two explicit branches on purpose: it is the
    // construct under measurement.
    if ((i / warpSize) % 2 == 0)
      d_arr[i] = (int)((unsigned int)d_arr[i] * 2u);
    else
      d_arr[i] = (int)((unsigned int)d_arr[i] * 3u);
  }
}

// Stores k at h_arr[k] for every k in [0, size).
// Returns immediately, writing nothing, when size is not positive.
void InitHostArray(int* h_arr, int const size)
{
  if (size <= 0)
    return;

  int k = 0;
  do
  {
    h_arr[k] = k;
  } while (++k < size);
}

// Compares the GPU output against a CPU reference and reports the verdict on
// stdout.
//
// Indices are grouped into runs of 32. The expected value of element i is
// h_arr[i] * 2 when its run number (i / 32) is even and h_arr[i] * 3 when it
// is odd. The product is formed in unsigned arithmetic and converted back to
// int, so values whose product exceeds INT_MAX wrap modulo 2^32 rather than
// being a signed overflow (undefined behaviour).
//
// The scan stops at the first mismatch and prints the input value, the GPU
// value and the index. "Arrays match." is printed when no mismatch is found.
//
//   h_arr     original input values
//   h_result  values copied back from the device
//   size      number of elements to compare in both arrays
void checkResult(int* h_arr, int* h_result, int const size)
{
  // Host-side stand-in for the warpSize built-in that the MulByTwo kernel
  // reads. It is hard-coded to 32, so this check only agrees with the kernel
  // on devices whose warp size is 32.
  int const kWarpSize = 32;

  for (int i = 0; i < size; i++)
  {
    unsigned int const factor = ((i / kWarpSize) % 2 == 0) ? 2u : 3u;
    int const expected = (int)((unsigned int)h_arr[i] * factor);

    if (expected != h_result[i])
    {
      printf("Arrays do not match!\n");
      printf("host %d gpu %d at current %d\n", h_arr[i], h_result[i], i);
      return;
    }
  }

  printf("Arrays match.\n\n");
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step in the message.
static void checkCuda(cudaError_t const status, char const* const what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/**
 * Times a single MulByTwo launch (warp-granular branching variant) over a
 * 1,000,000,000-element int array, after a full-grid warm-up launch.
 *
 * Steps: allocate and initialise the host input, copy it to the device, run
 * warmUp with the same launch configuration and wait for it, launch MulByTwo
 * between two CUDA events, print the elapsed time, copy the result back and
 * validate it with checkResult.
 *
 * Every host allocation and CUDA runtime call is checked; on failure a
 * message is written to stderr and the process exits with EXIT_FAILURE.
 *
 * @return 0 when all steps completed (the comparison verdict itself is only
 *         printed by checkResult, it does not affect the exit code).
 */
int main()
{
  int const nElem = 1000000000;
  // sizeof yields size_t, so the byte count is computed in size_t, not int.
  size_t const nBytes = sizeof(int) * nElem;

  int* h_arr = (int*)malloc(nBytes);
  int* h_result = (int*)malloc(nBytes);
  if (h_arr == NULL || h_result == NULL)
  {
    fprintf(stderr, "Failed to allocate %zu bytes of host memory per buffer\n", nBytes);
    free(h_arr);
    free(h_result);
    return EXIT_FAILURE;
  }
  InitHostArray(h_arr, nElem);

  int* d_arr = NULL;
  checkCuda(cudaMalloc(&d_arr, nBytes), "cudaMalloc");
  checkCuda(cudaMemcpy(d_arr, h_arr, nBytes, cudaMemcpyHostToDevice),
            "host-to-device copy");

  int const iLen = 128;
  dim3 block(iLen);
  printf("Block size is %d\n", iLen);
  // Ceiling division: enough blocks to give every element its own thread.
  dim3 grid((nElem + block.x - 1) / block.x);
  // grid.x is unsigned, hence %u.
  printf("Grid size is %u\n", grid.x);

  cudaEvent_t start, stop;
  checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
  checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

  // Warm-up pass over the whole array (multiplies by one, values unchanged),
  // issued and awaited before the timed region so none of its work lands
  // between the two events.
  warmUp<<<grid, block>>>(d_arr, nElem);
  checkCuda(cudaGetLastError(), "warmUp launch");
  checkCuda(cudaDeviceSynchronize(), "warmUp execution");

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  MulByTwo<<<grid, block>>>(d_arr, nElem);
  // A launch returns no status itself; configuration errors are fetched here.
  checkCuda(cudaGetLastError(), "MulByTwo launch");
  checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");

  // Blocks the host until the stop event (recorded after the kernel) completes.
  checkCuda(cudaEventSynchronize(stop), "MulByTwo execution");

  float ms = 0.0f;
  checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
  printf("Elapsed time %.5f ms\n", ms);

  checkCuda(cudaMemcpy(h_result, d_arr, nBytes, cudaMemcpyDeviceToHost),
            "device-to-host copy");

  checkResult(h_arr, h_result, nElem);

  checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
  checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

  checkCuda(cudaFree(d_arr), "cudaFree");
  free(h_arr);
  free(h_result);

  return 0;
}


Overwriting step3.cu


In [80]:
!nvcc -arch=sm_75 step3.cu -O0 -o step3

In [81]:
!./step3

Block size is 128
Grid size is 7812500
Elapsed time 34.19603 ms
Arrays match.

