In [1]:
!nvidia-smi


Tue Dec 30 16:26:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P0             27W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvidia-smi


Tue Dec 30 16:27:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P0             27W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [9]:
%%writefile coalesced.cu
#include <iostream>
#include <cuda_runtime.h>

__global__ void coalesced(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int n = 1 << 24;  // ~16 million
    size_t size = n * sizeof(float);

    float *h_a = new float[n];
    float *h_b = new float[n];

    for (int i = 0; i < n; i++) {
        h_a[i] = i;
        h_b[i] = 2 * i;
    }

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    // Warm-up
    coalesced<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int i = 0; i < 10; i++) {
        coalesced<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms;
    cudaEventElapsedTime(&ms, start, stop);

    std::cout << "Coalesced time (avg over 10 runs): "
              << ms / 10 << " ms\n";

    return 0;
}


Overwriting coalesced.cu


In [10]:
!nvcc coalesced.cu -o coalesced
!./coalesced


Coalesced time (avg over 10 runs): 0.365443 ms


In [11]:
%%writefile non_coalesced.cu
#include <iostream>
#include <cuda_runtime.h>

#define STRIDE 32

__global__ void non_coalesced(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        int access = (idx * STRIDE) % n;  // scattered access
        c[access] = a[access] + b[access];
    }
}

int main() {
    int n = 1 << 24;
    size_t size = n * sizeof(float);

    float *h_a = new float[n];
    float *h_b = new float[n];

    for (int i = 0; i < n; i++) {
        h_a[i] = i;
        h_b[i] = 2 * i;
    }

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    // Warm-up
    non_coalesced<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int i = 0; i < 10; i++) {
        non_coalesced<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms;
    cudaEventElapsedTime(&ms, start, stop);

    std::cout << "Non-coalesced time (avg over 10 runs): "
              << ms / 10 << " ms\n";

    return 0;
}


Overwriting non_coalesced.cu


In [12]:
!nvcc non_coalesced.cu -o non_coalesced
!./non_coalesced


Non-coalesced time (avg over 10 runs): 4.7251 ms


In [7]:
%%writefile shared_memory.cu
#include <iostream>
#include <cuda_runtime.h>

__global__ void global_kernel(float* x, float* y, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx > 0 && idx < n-1)
        y[idx] = x[idx-1] + x[idx] + x[idx+1];
}

__global__ void shared_kernel(float* x, float* y, int n) {
    extern __shared__ float s[];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int t = threadIdx.x;

    if (idx < n) s[t] = x[idx];
    __syncthreads();

    if (t > 0 && t < blockDim.x-1 && idx < n-1)
        y[idx] = s[t-1] + s[t] + s[t+1];
}

int main() {
    int n = 1 << 24;
    size_t size = n * sizeof(float);

    float *h_x = new float[n];
    for (int i = 0; i < n; i++) h_x[i] = i;

    float *d_x, *d_y;
    cudaMalloc(&d_x, size);
    cudaMalloc(&d_y, size);
    cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    cudaEvent_t s, e;
    cudaEventCreate(&s);
    cudaEventCreate(&e);

    cudaEventRecord(s);
    global_kernel<<<gridSize, blockSize>>>(d_x, d_y, n);
    cudaEventRecord(e);
    cudaEventSynchronize(e);
    float t1;
    cudaEventElapsedTime(&t1, s, e);

    cudaEventRecord(s);
    shared_kernel<<<gridSize, blockSize, blockSize*sizeof(float)>>>(d_x, d_y, n);
    cudaEventRecord(e);
    cudaEventSynchronize(e);
    float t2;
    cudaEventElapsedTime(&t2, s, e);

    std::cout << "Global memory time: " << t1 << " ms\n";
    std::cout << "Shared memory time: " << t2 << " ms\n";
    std::cout << "Speedup: " << t1/t2 << "x\n";
}


Writing shared_memory.cu


In [8]:
!nvcc shared_memory.cu -o shared_memory
!./shared_memory


Global memory time: 16.8835 ms
Shared memory time: 0.28144 ms
Speedup: 59.9897x


In [1]:
!nvidia-smi


/bin/bash: line 1: nvidia-smi: command not found


In [2]:
import numpy as np
import time

H = 2048
W = 2048

image = np.random.rand(H, W).astype(np.float32)

kernel = np.array([[1, 1, 1],
                   [1, 1, 1],
                   [1, 1, 1]], dtype=np.float32)

output = np.zeros((H, W), dtype=np.float32)

start = time.perf_counter()

for i in range(1, H - 1):
    for j in range(1, W - 1):
        s = 0.0
        for ki in range(-1, 2):
            for kj in range(-1, 2):
                s += kernel[ki + 1, kj + 1] * image[i + ki, j + kj]
        output[i, j] = s

end = time.perf_counter()

print("CPU 2D convolution time:", end - start, "seconds")


CPU 2D convolution time: 30.864006089000043 seconds
