In [1]:
%load_ext nvcc4jupyter

from nvcc4jupyter import set_defaults
# Set the default nvcc arguments used by nvcc4jupyter:
#   -arch=sm_100a : generate code for the sm_100a GPU target
#   -Xptxas=-v    : forward -v to ptxas (verbose output, e.g. resource usage)
set_defaults(compiler_args='-arch=sm_100a -Xptxas=-v')

Source files will be saved in "/tmp/tmps8b_gp8f".


Indeed, the memory model is weak: when threads run in parallel, if two threads read and write the same pieces of memory, there is a data race and the behaviour is undefined. 



In [9]:
%%cuda 

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cuda.h>

// Number of float elements in the array A (used for the buffer size and the loops).
constexpr int N = 4;

// Intentionally empty kernel: it receives the device array A but never reads
// or writes it, so the array contents are the same before and after the launch.
__global__ void kernel(float*A)
{
  
}

// Host driver for the empty-kernel experiment.
//
// Fills a page-locked host buffer with 1..N, copies it to the device, launches
// `kernel` as one block of 32 threads, copies the buffer back, and prints the
// array before and after the launch.
//
// Every CUDA runtime call is checked: without this, a failed allocation, copy
// or kernel launch would go unnoticed and the program would print stale data
// that looks like a valid result.
int main()
{
  // Abort with a readable message if a CUDA runtime call did not succeed.
  auto check = [](cudaError_t status, const char *what)
  {
    if (status != cudaSuccess)
    {
      fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
  };

  // Print all N elements of a host-side array on one line.
  auto printArray = [](const float *values)
  {
    for (int i = 0; i < N; i++)
    {
      printf("A[%d]: %f, ", i, values[i]);
    }
  };

  float *A_d = nullptr, *A_h = nullptr;
  size_t size = sizeof(float)*N;

  check(cudaHostAlloc(&A_h, size, cudaHostAllocDefault), "cudaHostAlloc");
  check(cudaMalloc(&A_d, size), "cudaMalloc");
  for (int i = 0; i < N; i++)
  {
    A_h[i] = (float)(i+1);
  }

  check(cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice), "cudaMemcpy H2D");

  printf("====  Before kernel execution  ====== \n");
  printArray(A_h);
  printf("\n");
  printf("====  After kernel execution  ====== \n");

  kernel<<<1,32>>>(A_d);
  // A launch does not return an error code itself: launch failures surface
  // through cudaGetLastError(), execution failures at the next synchronization.
  check(cudaGetLastError(), "kernel launch");
  check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  check(cudaMemcpy(A_h, A_d, size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H");
  printArray(A_h);

  check(cudaFree(A_d), "cudaFree");
  check(cudaFreeHost(A_h), "cudaFreeHost");

  return 0;
}


A[0]: 1.000000, A[1]: 2.000000, A[2]: 3.000000, A[3]: 4.000000, 
A[0]: 1.000000, A[1]: 2.000000, A[2]: 3.000000, A[3]: 4.000000, 


So below, we have created a data race: each thread reads from the same index 0 and writes a different value to that same index of global memory (gmem). This is obviously an issue. If this were
executing with some serialization, we would get A[0] = A[0] + 0 + 1 + 2 + 3, but since the threads run in parallel, the behaviour is undefined. Here it looks as if nothing is happening: A[0] remains unchanged. (Note that an unchanged A[0] is also exactly what we would see if thread 0's write, A[0] + 0, were the one that ended up in memory.)

In [14]:
%%cuda 

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cuda.h>

// Number of float elements in the array A; also the number of threads launched
// and the divisor in the race kernel's index computation.
constexpr int N = 4;

// Deliberately racy kernel.
//
// Each thread computes the element index threadIdx.x / N (integer division),
// so every thread with threadIdx.x < N targets element 0. It then performs an
// unsynchronized read-modify-write on that element, adding its own thread
// index. No atomics or barriers are used on purpose.
__global__ void race(float*A)
{
  const unsigned int tid = threadIdx.x;
  const unsigned int slot = tid / N;  // 0 for every tid in [0, N)

  A[slot] = A[slot] + tid;
}

// Host driver for the read-modify-write race experiment.
//
// Fills a page-locked host buffer with 1..N, copies it to the device, launches
// `race` as one block of N threads, copies the buffer back, and prints the
// array before and after the launch.
//
// Every CUDA runtime call is checked: without this, a failed allocation, copy
// or kernel launch would go unnoticed and the "after" output would simply
// repeat the "before" values, which is easy to misread as a kernel result.
int main()
{
  // Abort with a readable message if a CUDA runtime call did not succeed.
  auto check = [](cudaError_t status, const char *what)
  {
    if (status != cudaSuccess)
    {
      fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
  };

  // Print all N elements of a host-side array on one line.
  auto printArray = [](const float *values)
  {
    for (int i = 0; i < N; i++)
    {
      printf("A[%d]: %f, ", i, values[i]);
    }
  };

  float *A_d = nullptr, *A_h = nullptr;
  size_t size = sizeof(float)*N;

  check(cudaHostAlloc(&A_h, size, cudaHostAllocDefault), "cudaHostAlloc");
  check(cudaMalloc(&A_d, size), "cudaMalloc");
  for (int i = 0; i < N; i++)
  {
    A_h[i] = (float)(i+1);
  }

  check(cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice), "cudaMemcpy H2D");

  printf("====  Before kernel execution  ====== \n");
  printArray(A_h);
  printf("\n");
  printf("====  After kernel execution  ====== \n");

  race<<<1,N>>>(A_d);
  // A launch does not return an error code itself: launch failures surface
  // through cudaGetLastError(), execution failures at the next synchronization.
  check(cudaGetLastError(), "kernel launch");
  check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  check(cudaMemcpy(A_h, A_d, size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H");
  printArray(A_h);

  check(cudaFree(A_d), "cudaFree");
  check(cudaFreeHost(A_h), "cudaFreeHost");

  return 0;
}


A[0]: 1.000000, A[1]: 2.000000, A[2]: 3.000000, A[3]: 4.000000, 
A[0]: 1.000000, A[1]: 2.000000, A[2]: 3.000000, A[3]: 4.000000, 


Okay, now we launch 8 threads: the first 4 read the elements of A, and the next 4 write into those same elements. Let's see. 

In [17]:
%%cuda 

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cuda.h>

// Number of float elements in the array A; the kernel is launched with 2*N threads.
constexpr int N = 4;

// Reader/writer experiment on A, intended to be launched as one block of
// 8 threads over a 4-element array:
//   threads 0..3 read  A[t]   into a thread-local scratch array q,
//   threads 4..7 write A[t-4] with 0.1, 0.2, 0.3, 0.4.
// There is no synchronization between the readers and the writers.
//
// NOTE: the values read into q are never used afterwards, so whatever the
// readers observe is not visible to the host (and the compiler is free to
// drop the load). The final contents of A depend only on the writers.
//
// Threads with index >= 8 do nothing, so an oversized launch cannot index
// past the end of r or A.
__global__ void race(float*A)
{
  constexpr unsigned int kCount = 4;  // elements touched; size of r and q
  const unsigned int t = threadIdx.x;

  // f-suffixed literals keep the initializers in single precision.
  const float r[kCount] = {0.1f, 0.2f, 0.3f, 0.4f};
  float q[kCount] = {0.0f};

  if (t < kCount)
  {
    // Reader threads: unsynchronized load of the element a writer will overwrite.
    q[t] = A[t];
  }
  else if (t < 2 * kCount)
  {
    // Writer threads: unsynchronized store to the element a reader loads.
    A[t - kCount] = r[t - kCount];
  }
}

// Host driver for the reader/writer experiment.
//
// Fills a page-locked host buffer with 1..N, copies it to the device, launches
// `race` as one block of 2*N threads, copies the buffer back, and prints the
// array before and after the launch.
//
// Every CUDA runtime call is checked: without this, a failed allocation, copy
// or kernel launch would go unnoticed and the program would print stale data
// that looks like a valid result.
int main()
{
  // Abort with a readable message if a CUDA runtime call did not succeed.
  auto check = [](cudaError_t status, const char *what)
  {
    if (status != cudaSuccess)
    {
      fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
  };

  // Print all N elements of a host-side array on one line.
  auto printArray = [](const float *values)
  {
    for (int i = 0; i < N; i++)
    {
      printf("A[%d]: %f, ", i, values[i]);
    }
  };

  float *A_d = nullptr, *A_h = nullptr;
  size_t size = sizeof(float)*N;

  check(cudaHostAlloc(&A_h, size, cudaHostAllocDefault), "cudaHostAlloc");
  check(cudaMalloc(&A_d, size), "cudaMalloc");
  for (int i = 0; i < N; i++)
  {
    A_h[i] = (float)(i+1);
  }

  check(cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice), "cudaMemcpy H2D");

  printf("====  Before kernel execution  ====== \n");
  printArray(A_h);
  printf("\n");
  printf("====  After kernel execution  ====== \n");

  race<<<1,2*N>>>(A_d);
  // A launch does not return an error code itself: launch failures surface
  // through cudaGetLastError(), execution failures at the next synchronization.
  check(cudaGetLastError(), "kernel launch");
  check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  check(cudaMemcpy(A_h, A_d, size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H");
  printArray(A_h);

  check(cudaFree(A_d), "cudaFree");
  check(cudaFreeHost(A_h), "cudaFreeHost");

  return 0;
}


A[0]: 1.000000, A[1]: 2.000000, A[2]: 3.000000, A[3]: 4.000000, 
A[0]: 0.100000, A[1]: 0.200000, A[2]: 0.300000, A[3]: 0.400000, 


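Huh. No visible sign of a data race: A ends up holding exactly the written values, which might suggest that the accesses are getting serialized. However, this output cannot actually tell us that: the values the first 4 threads read into `q` are never stored or printed, so we cannot see whether they read the old or the new values, and the final contents of A are determined only by the 4 writer threads. 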