<a href="https://colab.research.google.com/github/PuppyQ08/CUDA_Exercise_Tutorial/blob/main/jupyter_Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Hi, this is QQY. These are CUDA test labs that use [nvcc4jupyter](https://github.com/andreinechaev/nvcc4jupyter). The first test is a benchmark comparing physical padding with logical padding.

In [1]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpbatmicqz".


This test checks that nvcc4jupyter works in the current environment.

In [5]:
%%cuda -c "--gpu-architecture sm_75"
#include <stdio.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Number of float elements in each of the vectors A, B and C.
constexpr int DSIZE = 4096;
// Threads per block used when launching the vector-add kernel.
constexpr int block_size = 256;

// Element-wise vector addition kernel: C[i] = A[i] + B[i] for 0 <= i < ds.
// Each thread handles at most one element, selected by its flat 1D global
// index (block index * block size + thread index). Threads whose index is
// ds or larger return without touching memory.
__global__ void vadd(const float *A, const float *B, float *C, int ds){
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= ds) {
        return;  // index lies past the end of the data
    }
    C[gid] = A[gid] + B[gid];
}

int main(){
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;

    // allocate space for vectors in host memory
    h_A = new float[DSIZE];
    h_B = new float[DSIZE];
    h_C = new float[DSIZE];

    // initialize vectors in host memory to random values (except for the
    // result vector whose values do not matter as they will be overwritten)
    for (int i = 0; i < DSIZE; i++) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // allocate space for vectors in device memory
    cudaMalloc(&d_A, DSIZE*sizeof(float));
    cudaMalloc(&d_B, DSIZE*sizeof(float));
    cudaMalloc(&d_C, DSIZE*sizeof(float));
    cudaCheckErrors("cudaMalloc failure"); // error checking

    // copy vectors A and B from host to device:
    cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D failure");

    // launch the vector adding kernel
    vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
    cudaCheckErrors("kernel launch failure");

    // wait for the kernel to finish execution
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel execution failure");

    cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy D2H failure");

    printf("A[0] = %f\n", h_A[0]);
    printf("B[0] = %f\n", h_B[0]);
    printf("C[0] = %f\n", h_C[0]);
    return 0;
}


A[0] = 0.840188
B[0] = 0.394383
C[0] = 1.234571



This is the benchmark test comparing manual padding with logical padding.

In [3]:
%%cuda -c "--gpu-architecture sm_75"
#include <cuda_runtime.h>

#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <vector>

// Helper to check for CUDA errors
#define CHECK_CUDA(call) { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \
        exit(1); \
    } \
}

// Benchmark: time two ways of getting an M x K float matrix onto the GPU.
//   Case 1: build a zero-padded PADDED_DIM x PADDED_DIM copy on the host,
//           then cudaMalloc + cudaMemcpy the whole padded buffer.
//   Case 2: cudaMallocPitch + cudaMemcpy2D straight from the unpadded host
//           matrix.
// Each case is bracketed by a pair of CUDA events, so the reported time covers
// the host-side work as well as the device allocation and copy. Prints both
// timings and their ratio; returns 0 on success. Any CUDA failure terminates
// the program through CHECK_CUDA.
int main() {
    const int M = 8192; // Large enough to see a difference
    const int K = 8192;
    // Padded dimension for the manual case: K plus one extra 32-element block
    // (K itself is already a multiple of 32).
    const int PADDED_DIM = 8224;

    // Element and byte counts are computed in size_t so the products cannot
    // overflow int if the dimensions are increased.
    const size_t padded_elems = static_cast<size_t>(PADDED_DIM) * PADDED_DIM;
    const size_t padded_bytes = padded_elems * sizeof(float);
    const size_t row_bytes = static_cast<size_t>(K) * sizeof(float);

    // Initialize source data
    std::vector<float> h_A_orig(static_cast<size_t>(M) * K, 1.0f);

    // Timing variables
    float time_manual = 0.0f, time_pitched = 0.0f;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // --- CASE 1: MANUAL CPU PADDING ---
    CHECK_CUDA(cudaEventRecord(start));

    // 1. CPU Work: Allocate and Zero
    float* h_A_padded = static_cast<float*>(calloc(padded_elems, sizeof(float)));
    if (h_A_padded == nullptr) {
        // Without this check the row copies below would write through a null pointer.
        std::cerr << "Host allocation of the padded buffer failed" << std::endl;
        exit(1);
    }
    // 2. CPU Work: Copy rows (each source row of K floats lands at the start
    //    of a PADDED_DIM-wide destination row; the remainder stays zero)
    for (int i = 0; i < M; i++) {
        memcpy(h_A_padded + static_cast<size_t>(i) * PADDED_DIM,
               h_A_orig.data() + static_cast<size_t>(i) * K,
               row_bytes);
    }
    // 3. GPU Work: Memcpy
    float* d_A_manual = nullptr;
    CHECK_CUDA(cudaMalloc(&d_A_manual, padded_bytes));
    CHECK_CUDA(cudaMemcpy(d_A_manual, h_A_padded, padded_bytes, cudaMemcpyHostToDevice));

    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));
    CHECK_CUDA(cudaEventElapsedTime(&time_manual, start, stop));

    // --- CASE 2: PITCHED GPU ALLOCATION ---
    CHECK_CUDA(cudaEventRecord(start));

    // 1. GPU Work: Allocate Pitched (the runtime chooses the row pitch in
    //    bytes and returns it in d_pitch)
    float* d_A_pitched = nullptr;
    size_t d_pitch = 0;
    CHECK_CUDA(cudaMallocPitch(&d_A_pitched, &d_pitch, row_bytes, M));
    // 2. GPU Work: Memcpy2D (Direct from original host pointer)
    //    The source pitch equals the row width because the host matrix is
    //    stored contiguously.
    CHECK_CUDA(cudaMemcpy2D(d_A_pitched, d_pitch, h_A_orig.data(), row_bytes, row_bytes, M, cudaMemcpyHostToDevice));

    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));
    CHECK_CUDA(cudaEventElapsedTime(&time_pitched, start, stop));

    // --- RESULTS ---
    std::cout << std::fixed << std::setprecision(3);
    std::cout << "Matrix Size: " << M << "x" << K << std::endl;
    std::cout << "------------------------------------------" << std::endl;
    std::cout << "Case 1 (Manual CPU Padding): " << time_manual << " ms" << std::endl;
    std::cout << "Case 2 (cudaMallocPitch):    " << time_pitched << " ms" << std::endl;
    std::cout << "------------------------------------------" << std::endl;
    std::cout << "Pitched is " << time_manual / time_pitched << "x faster." << std::endl;

    // Cleanup
    free(h_A_padded);
    CHECK_CUDA(cudaFree(d_A_manual));
    CHECK_CUDA(cudaFree(d_A_pitched));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

Matrix Size: 8192x8192
------------------------------------------
Case 1 (Manual CPU Padding): 199.938 ms
Case 2 (cudaMallocPitch):    57.003 ms
------------------------------------------
Pitched is 3.508x faster.

