In [1]:
%load_ext nvcc4jupyter

from nvcc4jupyter import set_defaults
set_defaults(compiler_args='-arch=sm_100a -Xptxas=-v')

Source files will be saved in "/tmp/tmpr0qgjj1d".


Indeed, the memory model is weak, that is, in paralell, if two threads are reading and writing the same peices of memory there is a data race and the behaviour is undefined. 



In [2]:
%%cuda 

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cuda.h>


__device__ int X = 10; 

__global__ void memory_race_race()
{
  uint t = threadIdx.x; 

  X = X + t + 1; 
}

int main()
{
  int h_X; // Host copy
  int expected_X = 10;
  // Sum of (t + 1) for t=0 to 31 is (1+2+...+32) = 528.
  // Expected final value is 10 + 528 = 538 (if sequential).
  
  memory_race_race<<<1, 32>>>();
  cudaDeviceSynchronize();
  
  // Copy final result back to host
// CORRECT
  cudaMemcpy(&h_X, &X, sizeof(int), cudaMemcpyDeviceToHost);

  printf("Expected Final X (if sequential): %d\n", 538);
  printf("Actual Final X (data race result): %d\n", h_X);
  
  return 0;
}

Expected Final X (if sequential): 538
Actual Final X (data race result): 0



In [3]:
%%cuda 

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cuda.h>


__device__ int X = 1, Y=2, A = 5, B = 10; 

__device__ void writeXY()
{
    X = 10;
    Y = 20;
}

__device__ void readXY()
{
    B = Y;
    A = X;
}

__global__ void memory_race_race()
{
  uint t = threadIdx.x; 
  if (t == 0)
  {
    writeXY();
  }
  if (t == 1)
  {
    readXY();
  }
}

int main()
{
  int h_X; // Host copy
  int h_A;
  int h_B;
  int h_Y;
  memory_race_race<<<1, 2>>>();
  cudaDeviceSynchronize();
  
  // Copy final result back to host
// CORRECT
  cudaMemcpy(&h_X, &X, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_Y, &Y, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_A, &A, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_B, &B, sizeof(int), cudaMemcpyDeviceToHost);

  printf("X: %d Y: %d A: %d B: %d\n", h_X, h_Y, h_A, h_B);
  
  return 0;
}

X: 0 Y: 0 A: 0 B: 0



In [40]:
%%cuda 

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cuda.h>


__device__ int X = 1, Y=2, A = 5, B = 10; 

__device__ void writeXY()
{
    X = 10;
    __threadfence();
    Y = 20;
    __threadfence();
}

__device__ void readXY()
{
    A = X;
    __threadfence();
    B = Y;
     __threadfence();
    
}

__global__ void memory_race_race()
{
  uint t = threadIdx.x; 
  if (t == 0)
  {
    writeXY();
  }
  if (t == 1)
  {
    readXY();
  }
}

int main()
{
  int h_X; // Host copy
  int h_A;
  int h_B;
  int h_Y;
  memory_race_race<<<1, 2>>>();
  cudaDeviceSynchronize();
  
  // Copy final result back to host
// CORRECT
  cudaMemcpy(&h_X, &X, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_Y, &Y, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_A, &A, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_B, &B, sizeof(int), cudaMemcpyDeviceToHost);

  printf("X: %d Y: %d A: %d B: %d\n", h_X, h_Y, h_A, h_B);
  
  return 0;
}

X: 0 Y: 0 A: 0 B: 0



In [6]:
%%cuda 

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cuda.h>


__device__ int X = 1, Y=2, A = 5, B = 10; 

__device__ void writeXY()
{
    X = 10;

    Y = 20;
    
}

__device__ void readXY()
{
    A = X;

    B = Y;

    
}

__global__ void memory_race_race()
{
  uint t = threadIdx.x; 
  if (t == 0)
  {
    writeXY();
  }
  
  __syncthreads();
  printf("X: %d \n", X);
  if (t == 1)
  {
    readXY();
  }
}

int main()
{
  int h_X; // Host copy
  int h_A;
  int h_B;
  int h_Y;
  memory_race_race<<<1, 2>>>();
  cudaDeviceSynchronize();
  
  // Copy final result back to host
// CORRECT
  cudaMemcpy(&h_X, &X, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_Y, &Y, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_A, &A, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_B, &B, sizeof(int), cudaMemcpyDeviceToHost);

  printf("X: %d Y: %d A: %d B: %d\n", h_X, h_Y, h_A, h_B);
  
  return 0;
}

X: 10 
X: 10 
X: 0 Y: 0 A: 0 B: 0



In [7]:
%%cuda 

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cuda.h>

// 1. Declare device variables without initialization
__device__ int X, Y, A, B; 

__device__ void writeXY()
{
    X = 10;
    Y = 20;
}

__device__ void readXY()
{
    B = Y; // Read Y's value into B
    A = X; // Read X's value into A
}

// ðŸ’¥ Race Condition Kernel (Still a race on consistency!)
__global__ void memory_race_race()
{
  uint t = threadIdx.x; 
  if (t == 0)
  {
    writeXY();
  }
  if (t == 1)
  {
    readXY();
  }
}

int main()
{
    // Host-side initialization values
    int host_X = 1, host_Y = 2;
    int host_A = 5, host_B = 10;
    
    // 2. Copy initial values from host to device global variables
    cudaMemcpyToSymbol(X, &host_X, sizeof(int));
    cudaMemcpyToSymbol(Y, &host_Y, sizeof(int));
    cudaMemcpyToSymbol(A, &host_A, sizeof(int));
    cudaMemcpyToSymbol(B, &host_B, sizeof(int));

    // 3. Launch the kernel
    memory_race_race<<<1, 2>>>();
    cudaDeviceSynchronize();
    
    // 4. Copy final results from device global variables to host
    cudaMemcpyFromSymbol(&host_X, X, sizeof(int));
    cudaMemcpyFromSymbol(&host_Y, Y, sizeof(int));
    cudaMemcpyFromSymbol(&host_A, A, sizeof(int));
    cudaMemcpyFromSymbol(&host_B, B, sizeof(int));

    // The output will now be non-zero and non-deterministic (a race result).
    printf("X: %d Y: %d A: %d B: %d\n", host_X, host_Y, host_A, host_B);
  
    return 0;
}

X: 10 Y: 20 A: 10 B: 20

