In [4]:
!nvidia-smi
!nvcc --version


Mon Oct  6 18:00:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Part 1 – Memory Allocation and Copy

In [5]:
%%writefile part1_memory.cu
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    const int N = 8;  // small for display
    float A[N], B[N];
    for(int i=0;i<N;i++){ A[i]=i; B[i]=2*i; }

    float *d_A, *d_B;
    cudaMalloc(&d_A, N*sizeof(float));
    cudaMalloc(&d_B, N*sizeof(float));

    cudaMemcpy(d_A, A, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N*sizeof(float), cudaMemcpyHostToDevice);

    printf("=== Part 1: Memory Allocation ===\\n");
    printf("Host A: "); for(int i=0;i<N;i++) printf("%.1f ", A[i]);
    printf("\\nHost B: "); for(int i=0;i<N;i++) printf("%.1f ", B[i]);
    printf("\\nGPU memory allocated & data copied successfully.\\n");

    cudaFree(d_A); cudaFree(d_B);
    return 0;
}


Writing part1_memory.cu


In [6]:
!nvcc -arch=sm_75 part1_memory.cu -o part1 && ./part1


=== Part 1: Memory Allocation ===\nHost A: 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 \nHost B: 0.0 2.0 4.0 6.0 8.0 10.0 12.0 14.0 \nGPU memory allocated & data copied successfully.\n

# Part 2 – Serial Kernel Execution

In [7]:
%%writefile part2_serial.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void kernel1(float *A,float *B,float *C,int N){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  if(i<N) C[i]=A[i]+B[i];
}
__global__ void kernel2(float *C,float *D,int N){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  if(i<N) D[i]=C[i]*C[i];
}

int main(){
    const int N=8;
    float A[N],B[N],C[N],D[N];
    for(int i=0;i<N;i++){A[i]=i;B[i]=2*i;}

    float *dA,*dB,*dC,*dD;
    cudaMalloc(&dA,N*sizeof(float));
    cudaMalloc(&dB,N*sizeof(float));
    cudaMalloc(&dC,N*sizeof(float));
    cudaMalloc(&dD,N*sizeof(float));

    cudaMemcpy(dA,A,N*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(dB,B,N*sizeof(float),cudaMemcpyHostToDevice);

    kernel1<<<1,N>>>(dA,dB,dC,N);
    kernel2<<<1,N>>>(dC,dD,N);
    cudaMemcpy(D,dD,N*sizeof(float),cudaMemcpyDeviceToHost);

    printf("=== Part 2: Serial Execution ===\\nD: ");
    for(int i=0;i<N;i++) printf("%.1f ",D[i]);
    printf("\\n");

    cudaFree(dA);cudaFree(dB);cudaFree(dC);cudaFree(dD);
}


Writing part2_serial.cu


In [8]:
!nvcc -arch=sm_75 part2_serial.cu -o part2 && ./part2


      float A[N],B[N],C[N],D[N];
                      ^


=== Part 2: Serial Execution ===\nD: 0.0 9.0 36.0 81.0 144.0 225.0 324.0 441.0 \n

# Part 3 – Streams and Race Conditions

In [9]:
%%writefile part3_streams.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void kernel1(float *A,float *B,float *C,int N){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  if(i<N) C[i]=A[i]+B[i];
}
__global__ void kernel2(float *C,float *D,int N){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  if(i<N) D[i]=C[i]*C[i];
}

int main(){
    const int N=8;
    float A[N],B[N],C[N],D[N];
    for(int i=0;i<N;i++){A[i]=i;B[i]=2*i;}

    float *dA,*dB,*dC,*dD;
    cudaMalloc(&dA,N*sizeof(float));
    cudaMalloc(&dB,N*sizeof(float));
    cudaMalloc(&dC,N*sizeof(float));
    cudaMalloc(&dD,N*sizeof(float));
    cudaMemcpy(dA,A,N*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(dB,B,N*sizeof(float),cudaMemcpyHostToDevice);

    cudaStream_t s1,s2; cudaStreamCreate(&s1); cudaStreamCreate(&s2);
    int half=N/2;

    kernel1<<<1,half,0,s1>>>(dA,dB,dC,half);
    kernel2<<<1,half,0,s1>>>(dC,dD,half);

    kernel1<<<1,half,0,s2>>>(dA+half,dB+half,dC+half,half);
    kernel2<<<1,half,0,s2>>>(dC+half,dD+half,half);

    cudaStreamSynchronize(s1);
    cudaStreamSynchronize(s2);
    cudaMemcpy(D,dD,N*sizeof(float),cudaMemcpyDeviceToHost);

    printf("=== Part 3: Streams (safe split) ===\\nD: ");
    for(int i=0;i<N;i++) printf("%.1f ",D[i]);
    printf("\\n");

    cudaFree(dA);cudaFree(dB);cudaFree(dC);cudaFree(dD);
    cudaStreamDestroy(s1);cudaStreamDestroy(s2);
}


Writing part3_streams.cu


In [10]:
!nvcc -arch=sm_75 part3_streams.cu -o part3 && ./part3


      float A[N],B[N],C[N],D[N];
                      ^


=== Part 3: Streams (safe split) ===\nD: 0.0 9.0 36.0 81.0 144.0 225.0 324.0 441.0 \n

# Part 4 – Synchronization

In [11]:
%%writefile part4_sync.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void kernel(float *A,float *B,float *C,int N){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  if(i<N) C[i]=A[i]+B[i];
}

int main(){
    const int N=8;
    float A[N],B[N],C[N]={0};
    for(int i=0;i<N;i++){A[i]=i;B[i]=2*i;}
    float *dA,*dB,*dC; cudaMalloc(&dA,N*sizeof(float));
    cudaMalloc(&dB,N*sizeof(float)); cudaMalloc(&dC,N*sizeof(float));
    cudaMemcpy(dA,A,N*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(dB,B,N*sizeof(float),cudaMemcpyHostToDevice);

    kernel<<<1,N>>>(dA,dB,dC,N);
    // No cudaDeviceSynchronize() yet
    cudaMemcpy(C,dC,N*sizeof(float),cudaMemcpyDeviceToHost); // may be incomplete
    printf("=== Part 4: Without Sync ===\\nC: ");
    for(int i=0;i<N;i++) printf("%.1f ",C[i]); printf("\\n");

    kernel<<<1,N>>>(dA,dB,dC,N);
    cudaDeviceSynchronize(); // ensure finish
    cudaMemcpy(C,dC,N*sizeof(float),cudaMemcpyDeviceToHost);
    printf("=== With cudaDeviceSynchronize() ===\\nC: ");
    for(int i=0;i<N;i++) printf("%.1f ",C[i]); printf("\\n");

    cudaFree(dA);cudaFree(dB);cudaFree(dC);
}


Writing part4_sync.cu


In [12]:
!nvcc -arch=sm_75 part4_sync.cu -o part4 && ./part4


=== Part 4: Without Sync ===\nC: 0.0 3.0 6.0 9.0 12.0 15.0 18.0 21.0 \n=== With cudaDeviceSynchronize() ===\nC: 0.0 3.0 6.0 9.0 12.0 15.0 18.0 21.0 \n

# Part 5 – Thread Hierarchy

In [13]:
%%writefile part5_threads.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void info(int N){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  if(i<8)
    printf("blockIdx=%d threadIdx=%d -> i=%d\\n",blockIdx.x,threadIdx.x,i);
}

int main(){
  const int N=1024;
  printf("=== Part 5: Thread hierarchy ===\\n<<<1,%d>>>:\\n",N);
  info<<<1,N>>>(N);
  cudaDeviceSynchronize();
  printf("\\n<<<%d,32>>>:\\n",N/32);
  info<<<N/32,32>>>(N);
  cudaDeviceSynchronize();
}


Writing part5_threads.cu


In [14]:
!nvcc -arch=sm_75 part5_threads.cu -o part5 && ./part5


=== Part 5: Thread hierarchy ===\n<<<1,1024>>>:\nblockIdx=0 threadIdx=0 -> i=0\nblockIdx=0 threadIdx=1 -> i=1\nblockIdx=0 threadIdx=2 -> i=2\nblockIdx=0 threadIdx=3 -> i=3\nblockIdx=0 threadIdx=4 -> i=4\nblockIdx=0 threadIdx=5 -> i=5\nblockIdx=0 threadIdx=6 -> i=6\nblockIdx=0 threadIdx=7 -> i=7\n\n<<<32,32>>>:\nblockIdx=0 threadIdx=0 -> i=0\nblockIdx=0 threadIdx=1 -> i=1\nblockIdx=0 threadIdx=2 -> i=2\nblockIdx=0 threadIdx=3 -> i=3\nblockIdx=0 threadIdx=4 -> i=4\nblockIdx=0 threadIdx=5 -> i=5\nblockIdx=0 threadIdx=6 -> i=6\nblockIdx=0 threadIdx=7 -> i=7\n

# Part 6 – Reduction (Shared Memory + atomicAdd)

In [15]:
%%writefile part6_reduction.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void reduce(const float *in, float *out, int N){
    extern __shared__ float s[];
    int tid=threadIdx.x;
    int i=blockIdx.x*blockDim.x*2+threadIdx.x;
    float sum=0;
    if(i<N) sum+=in[i];
    if(i+blockDim.x<N) sum+=in[i+blockDim.x];
    s[tid]=sum; __syncthreads();
    for(int sSize=blockDim.x/2;sSize>0;sSize>>=1){
        if(tid<sSize) s[tid]+=s[tid+sSize];
        __syncthreads();
    }
    if(tid==0) atomicAdd(out,s[0]);
}

int main(){
    const int N=1024;
    float D[N]; for(int i=0;i<N;i++) D[i]=9*i*i;
    float *dD,*dSum; cudaMalloc(&dD,N*sizeof(float)); cudaMalloc(&dSum,sizeof(float));
    cudaMemcpy(dD,D,N*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemset(dSum,0,sizeof(float));
    reduce<<<N/512,256,256*sizeof(float)>>>(dD,dSum,N);
    cudaDeviceSynchronize();
    float hostSum; cudaMemcpy(&hostSum,dSum,sizeof(float),cudaMemcpyDeviceToHost);
    printf("=== Part 6: Reduction ===\\nSum of D = %.2f\\n",hostSum);
    cudaFree(dD); cudaFree(dSum);
}


Writing part6_reduction.cu


In [16]:
!nvcc -arch=sm_75 part6_reduction.cu -o part6 && ./part6


=== Part 6: Reduction ===\nSum of D = 3216508416.00\n