In [1]:
import subprocess
import pandas as pd

In [3]:
# Report the GPU model, driver version and CUDA version visible to this runtime.
!nvidia-smi

Wed Jan 28 19:07:21 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Part 1: Matrix Multiplication on CPU

In [4]:
def run_time_comparasion(n_list, command = './matrix_cpu'):
    """Run a benchmark executable once per matrix size and collect its reported times.

    For every ``n`` in ``n_list`` the executable is invoked as ``command n``.
    Its standard output is printed, and the text after the last ``':'`` of that
    output (with surrounding whitespace stripped) is recorded, e.g.
    ``'0.0003 sec'``.

    Args:
        n_list: Iterable of matrix sizes; each one is passed to the executable
            as its only command-line argument.
        command: Path (or name on ``PATH``) of the executable to run.

    Returns:
        list[str]: One time string per entry of ``n_list``, in the same order.

    Raises:
        FileNotFoundError: If ``command`` does not exist.
        RuntimeError: If the executable exits with a non-zero status. The
            message includes the captured standard error.
    """
    time_list = []

    for n in n_list:
        run_cmd = subprocess.run([command, str(n)], capture_output=True, text=True)

        # A failed run must never be recorded as a timing: without this check an
        # empty or partial stdout would silently end up in the results table.
        if run_cmd.returncode != 0:
            raise RuntimeError(
                f"{command} {n} exited with status {run_cmd.returncode}: "
                f"{run_cmd.stderr.strip()}"
            )

        output = run_cmd.stdout
        print(output)

        # The benchmarks print "<label> (N=<n>): <time> <unit>"; keep "<time> <unit>".
        time_list.append(output.split(':')[-1].strip())

    return time_list



In [5]:
%%writefile matrix_cpu.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Dense matrix product C = A * B for square N x N matrices stored row-major
// in flat arrays. Every element of C is overwritten with the dot product of
// the matching row of A and column of B, accumulated in k = 0..N-1 order.
void matrixMultiplyCPU(float *A, float *B, float *C, int N) {
    for (int row = 0; row < N; ++row) {
        const float *a_row = A + row * N;  // start of row `row` in A
        float *c_row = C + row * N;        // start of row `row` in C

        for (int col = 0; col < N; ++col) {
            float acc = 0.0f;
            for (int k = 0; k < N; ++k)
                acc += a_row[k] * B[k * N + col];
            c_row[col] = acc;
        }
    }
}

// Benchmark driver: fills two N x N matrices with pseudo-random values,
// times one call to matrixMultiplyCPU and prints the elapsed time.
// Usage: ./matrix_cpu [N]   (N defaults to 1024)
// Returns 0 on success, 1 on an invalid size or allocation failure.
int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 1024; // allow matrix size as input
    if (N <= 0) {
        // atoi yields 0 for non-numeric input, so this also rejects garbage.
        fprintf(stderr, "Matrix size must be a positive integer\n");
        return 1;
    }

    // Widen before multiplying so the element count is computed in size_t
    // rather than overflowing in int for large N.
    size_t count = (size_t)N * (size_t)N;
    size_t size = count * sizeof(float);

    float *A = (float *)malloc(size);
    float *B = (float *)malloc(size);
    float *C = (float *)malloc(size);
    if (!A || !B || !C) {
        fprintf(stderr, "Host malloc failed\n");
        free(A); free(B); free(C);  // free(NULL) is a no-op
        return 1;
    }

    // Inputs are multiples of 0.01 in the range [0.00, 0.99].
    for (size_t i = 0; i < count; i++) {
        A[i] = rand() % 100 / 100.0f;
        B[i] = rand() % 100 / 100.0f;
    }

    // Only the multiplication itself is timed; clock() reports processor time.
    clock_t start = clock();
    matrixMultiplyCPU(A, B, C, N);
    clock_t end = clock();
    double elapsed = (double)(end - start) / CLOCKS_PER_SEC;

    printf("CPU execution time (N=%d): %.4f sec\n", N, elapsed);
    free(A); free(B); free(C);
    return 0;
}

Overwriting matrix_cpu.c


In [6]:
# Compile the CPU benchmark with -O2; the resulting ./matrix_cpu binary is the
# default `command` of run_time_comparasion.
!gcc matrix_cpu.c -o matrix_cpu -O2

In [7]:
# Matrix sizes to benchmark: the powers of two from 2**6 (64) through 2**11 (2048).
n_list = [1 << exponent for exponent in range(6, 12)]
# Run the compiled CPU binary once per size; `a` holds one time string per size.
a = run_time_comparasion(n_list, './matrix_cpu')


CPU execution time (N=64): 0.0003 sec

CPU execution time (N=128): 0.0025 sec

CPU execution time (N=256): 0.0222 sec

CPU execution time (N=512): 0.3369 sec

CPU execution time (N=1024): 3.3072 sec

CPU execution time (N=2048): 78.9370 sec



In [8]:
# One-row summary of the CPU timings with one 'N=<size>' column per benchmarked
# size. Columns are derived from n_list so the table cannot drift out of sync
# with the sizes that were actually run.
assert len(a) == len(n_list), "expected exactly one timing per matrix size"
df = pd.DataFrame({
    'Implementation': ['CPU (C)'],
    **{f'N={n}': [str(elapsed)] for n, elapsed in zip(n_list, a)},
})
df

Unnamed: 0,Implementation,N=64,N=128,N=256,N=512,N=1024,N=2048
0,CPU (C),0.0003 sec,0.0025 sec,0.0222 sec,0.3369 sec,3.3072 sec,78.9370 sec


## Step 2.1: Naïve CUDA Kernel

In [9]:
%%writefile matrix_naive_gpu.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Naive matrix product kernel for row-major N x N matrices: each thread
// computes at most one element of C = A * B, selected by its 2D global index.
__global__ void matrixMultiplyGPU(const float *A, const float *B, float *C, int N) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;  // x dimension -> column
    const int row = blockIdx.y * blockDim.y + threadIdx.y;  // y dimension -> row

    // The grid is rounded up to whole blocks, so some threads fall outside
    // the matrix and have nothing to compute.
    if (row >= N || col >= N) {
        return;
    }

    float acc = 0.0f;
    for (int k = 0; k < N; ++k) {
        acc += A[row * N + k] * B[k * N + col];
    }
    C[row * N + col] = acc;
}

// Abort with a diagnostic on stderr when a CUDA runtime call did not succeed.
//   err: status returned by the CUDA call being checked
//   msg: short label naming that call, echoed in the error message
static void checkCuda(cudaError_t err, const char *msg) {
    if (err == cudaSuccess) {
        return;
    }

    fprintf(stderr, "CUDA error (%s): %s\n", msg, cudaGetErrorString(err));
    exit(1);
}

// Benchmark driver: fills two N x N matrices on the host, copies them to the
// device, times one launch of matrixMultiplyGPU with CUDA events and prints
// the kernel time in milliseconds.
// Usage: ./matrix_naive_gpu [N]   (N defaults to 1024)
// Returns 0 on success, 1 on an invalid size or host allocation failure;
// any CUDA error terminates the process through checkCuda.
int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 1024;
    if (N <= 0) {
        // atoi yields 0 for non-numeric input, so this also rejects garbage.
        fprintf(stderr, "Matrix size must be a positive integer\n");
        return 1;
    }

    // Element count in size_t so neither the byte size nor the fill loop
    // overflows int for large N.
    size_t count = (size_t)N * (size_t)N;
    size_t size = count * sizeof(float);

    // Host memory
    float *A = (float*)malloc(size);
    float *B = (float*)malloc(size);
    float *C = (float*)malloc(size);
    if (!A || !B || !C) {
        fprintf(stderr, "Host malloc failed\n");
        free(A); free(B); free(C);  // free(NULL) is a no-op
        return 1;
    }
    // Inputs are multiples of 0.01 in the range [0.00, 0.99].
    for (size_t i = 0; i < count; i++) {
        A[i] = (rand() % 100) / 100.0f;
        B[i] = (rand() % 100) / 100.0f;
    }

    // Device memory
    float *dA, *dB, *dC;
    checkCuda(cudaMalloc((void**)&dA, size), "cudaMalloc dA");
    checkCuda(cudaMalloc((void**)&dB, size), "cudaMalloc dB");
    checkCuda(cudaMalloc((void**)&dC, size), "cudaMalloc dC");

    checkCuda(cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice), "Memcpy A H2D");
    checkCuda(cudaMemcpy(dB, B, size, cudaMemcpyHostToDevice), "Memcpy B H2D");

    // 16x16 threads per block; the grid is rounded up so it covers all N x N
    // elements even when N is not a multiple of 16.
    dim3 block(16, 16);
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "Event create start");
    checkCuda(cudaEventCreate(&stop),  "Event create stop");

    // Only the kernel is timed; the host<->device copies are outside the events.
    checkCuda(cudaEventRecord(start), "Event record start");
    matrixMultiplyGPU<<<grid, block>>>(dA, dB, dC, N);
    checkCuda(cudaGetLastError(), "Kernel launch");
    checkCuda(cudaEventRecord(stop), "Event record stop");
    checkCuda(cudaEventSynchronize(stop), "Event sync stop");

    // cudaEventElapsedTime always reports the time in milliseconds
    float ms = 0.0f;
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "Event elapsed time");

    // Copy the product back so the host buffer C actually holds the result.
    // Done before printing so a failed run never reports a time.
    checkCuda(cudaMemcpy(C, dC, size, cudaMemcpyDeviceToHost), "Memcpy C D2H");

    printf("Naive CUDA kernel time (N=%d): %.4f ms\n", N, ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    free(A); free(B); free(C);
    return 0;
}




Overwriting matrix_naive_gpu.cu


## Part 3: Running CUDA on Google Cloud

In [10]:
# Compile the naive CUDA benchmark with -O2, generating code for sm_75 only
# (presumably chosen to match the Tesla T4 reported by nvidia-smi above; confirm
# before running on a different GPU).
!nvcc -O2 -gencode arch=compute_75,code=sm_75 matrix_naive_gpu.cu -o matrix_naive_gpu


In [11]:
# Same matrix sizes as the CPU run: powers of two from 2**6 (64) through 2**11 (2048).
n_list = [1 << exponent for exponent in range(6, 12)]
# Run the compiled CUDA binary once per size; `b` holds one time string per size.
b = run_time_comparasion(n_list, './matrix_naive_gpu')
b

Naive CUDA kernel time (N=64): 0.1086 ms

Naive CUDA kernel time (N=128): 0.1229 ms

Naive CUDA kernel time (N=256): 0.2575 ms

Naive CUDA kernel time (N=512): 1.1962 ms

Naive CUDA kernel time (N=1024): 9.2317 ms

Naive CUDA kernel time (N=2048): 74.9345 ms



['0.1086 ms', '0.1229 ms', '0.2575 ms', '1.1962 ms', '9.2317 ms', '74.9345 ms']

## Performance Matrix

In [12]:
# One-row table of the CPU timings with one 'N=<size>' column per benchmarked
# size, derived from n_list instead of six hand-written columns.
assert len(a) == len(n_list), "expected exactly one CPU timing per matrix size"
df_cpu = pd.DataFrame({
    'Implementation': ['CPU (C)'],
    **{f'N={n}': [str(elapsed)] for n, elapsed in zip(n_list, a)},
})


In [13]:
# One-row table of the naive CUDA kernel timings with one 'N=<size>' column per
# benchmarked size, derived from n_list instead of six hand-written columns.
assert len(b) == len(n_list), "expected exactly one GPU timing per matrix size"
df_gpu = pd.DataFrame({
    'Implementation': ['Naïve CUDA'],
    **{f'N={n}': [str(elapsed)] for n, elapsed in zip(n_list, b)},
})
df_gpu

Unnamed: 0,Implementation,N=64,N=128,N=256,N=512,N=1024,N=2048
0,Naïve CUDA,0.1086 ms,0.1229 ms,0.2575 ms,1.1962 ms,9.2317 ms,74.9345 ms


In [14]:

# Stack the CPU and GPU rows into a single comparison table with a fresh index.
# NOTE: the CPU row is reported in seconds and the GPU row in milliseconds.
df = pd.concat(objs=(df_cpu, df_gpu), ignore_index=True)
df

Unnamed: 0,Implementation,N=64,N=128,N=256,N=512,N=1024,N=2048
0,CPU (C),0.0003 sec,0.0025 sec,0.0222 sec,0.3369 sec,3.3072 sec,78.9370 sec
1,Naïve CUDA,0.1086 ms,0.1229 ms,0.2575 ms,1.1962 ms,9.2317 ms,74.9345 ms
