In [1]:
!nvidia-smi


Tue Nov 11 18:08:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   62C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [3]:
%%writefile matrix_mul_naive.cu
#include <iostream>
#include <cuda_runtime.h>
#define N 1024


__global__ void matrixMulNaive(const float* A, const float* B, float* C, int n) {
    int row=blockIdx.y*blockDim.y +threadIdx.y;
    int col=blockIdx.x*blockDim.x +threadIdx.x;
    if (row <n && col<n) {
        float val=0.0f;
        for (int k=0; k< n; ++k)
            val += A[row*n + k]*B[k * n + col];
        C[row*n + col] = val;
    }
}



int main() {
    int size = N * N * sizeof(float);
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    for (int i = 0; i < N * N; i++) {
        h_A[i] = static_cast<float>(rand()) / RAND_MAX;
        h_B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    dim3 threads(16, 16);
    dim3 blocks((N + threads.x - 1) / threads.x, (N + threads.y - 1) / threads.y);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // run kernel
    cudaEventRecord(start);
    matrixMulNaive<<<blocks, threads>>>(d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    double flops = 2.0 * N * N * N;
    double gflops = flops / (ms * 1e6);
    std::cout << "Native CUDA Matrix Multiplication:\n";
    std::cout << ms << " ms,  Performance: " << gflops << " GFLOPS\n";

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}

###### COMPILE ###########
//!nvcc -O3 -arch=sm_75 matrix_mul_naive.cu -o matrix_mul_naive
//!./matrix_mul_naive

Writing matrix_mul_naive.cu


In [4]:
!nvcc -O3 -arch=sm_75 matrix_mul_naive.cu -o matrix_mul_naive
!./matrix_mul_naive

[01m[0m[01mmatrix_mul_naive.cu(65)[0m: [01;31merror[0m: unrecognized preprocessing directive
  ###### COMPILE ###########
   ^

1 error detected in the compilation of "matrix_mul_naive.cu".
/bin/bash: line 1: ./matrix_mul_naive: No such file or directory


In [5]:
%%writefile matrix_mul_tiled_prof.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define TILE 16

__global__ void matmul_naive(const float *A,const float *B,float *C,int M,int N,int K){
 int row=blockIdx.y*blockDim.y+threadIdx.y;
 int col=blockIdx.x*blockDim.x+threadIdx.x;
 if(row<M && col<K){
   float val=0;
   for(int i=0;i<N;i++)
     val+=A[row*N+i]*B[i*K+col];
   C[row*K+col]=val;
 }
}

__global__ void matmul_tiled(const float *A,const float *B,float *C,int M,int N,int K){
 __shared__ float As[TILE][TILE];
 __shared__ float Bs[TILE][TILE];
 int row=blockIdx.y*TILE+threadIdx.y;
 int col=blockIdx.x*TILE+threadIdx.x;
 float val=0.0f;
 for(int t=0;t<(N+TILE-1)/TILE;t++){
   if(row<M && t*TILE+threadIdx.x<N)
     As[threadIdx.y][threadIdx.x]=A[row*N+t*TILE+threadIdx.x];
   else As[threadIdx.y][threadIdx.x]=0.0f;

   if(col<K && t*TILE+threadIdx.y<N)
     Bs[threadIdx.y][threadIdx.x]=B[(t*TILE+threadIdx.y)*K+col];
   else Bs[threadIdx.y][threadIdx.x]=0.0f;
   __syncthreads();

   for(int i=0;i<TILE;i++)
     val+=As[threadIdx.y][i]*Bs[i][threadIdx.x];
   __syncthreads();
 }
 if(row<M && col<K) C[row*K+col]=val;
}

void fill(float *a,int m,int n){
  for(int i=0;i<m*n;i++) a[i]=(float)(rand()%10);
}

int main(){
 int M=1024,N=1024,K=1024;
 size_t sA=M*N*sizeof(float), sB=N*K*sizeof(float), sC=M*K*sizeof(float);
 float *hA=(float*)malloc(sA);
 float *hB=(float*)malloc(sB);
 float *hC=(float*)malloc(sC);
 fill(hA,M,N); fill(hB,N,K);
 float *dA,*dB,*dC;
 cudaMalloc(&dA,sA); cudaMalloc(&dB,sB); cudaMalloc(&dC,sC);
 cudaMemcpy(dA,hA,sA,cudaMemcpyHostToDevice);
 cudaMemcpy(dB,hB,sB,cudaMemcpyHostToDevice);
 dim3 block(TILE,TILE);
 dim3 grid((K+TILE-1)/TILE,(M+TILE-1)/TILE);

 cudaEvent_t start,stop;
 cudaEventCreate(&start); cudaEventCreate(&stop);

 cudaEventRecord(start);
 matmul_naive<<<grid,block>>>(dA,dB,dC,M,N,K);
 cudaEventRecord(stop);
 cudaEventSynchronize(stop);
 float ms1;
 cudaEventElapsedTime(&ms1,start,stop);
 double gflops1=2.0*M*N*K/(ms1/1000.0)/1e9;
 printf("Naive: %.4f ms  %.2f GFLOPS\n",ms1,gflops1);

 cudaMemset(dC,0,sC);
 cudaEventRecord(start);
 matmul_tiled<<<grid,block>>>(dA,dB,dC,M,N,K);
 cudaEventRecord(stop);
 cudaEventSynchronize(stop);
 float ms2;
 cudaEventElapsedTime(&ms2,start,stop);
 double gflops2=2.0*M*N*K/(ms2/1000.0)/1e9;
 printf("Tiled: %.4f ms  %.2f GFLOPS\n",ms2,gflops2);

 cudaFree(dA); cudaFree(dB); cudaFree(dC);
 free(hA); free(hB); free(hC);
 return 0;
}



//!nvcc -O3 -arch=sm_75 matrix_mul_tiled_prof.cu -o matrix_mul_tiled_prof
//!nvprof ./matrix_mul_tiled_prof

Writing matrix_mul_tiled_prof.cu


In [6]:
!nvcc -O3 -arch=sm_75 matrix_mul_tiled_prof.cu -o matrix_mul_tiled_prof
!nvprof ./matrix_mul_tiled_prof

==674== NVPROF is profiling process 674, command: ./matrix_mul_tiled_prof
Naive: 9.3643 ms  229.33 GFLOPS
Tiled: 5.8026 ms  370.09 GFLOPS
==674== Profiling application: ./matrix_mul_tiled_prof
==674== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   55.51%  9.1975ms         1  9.1975ms  9.1975ms  9.1975ms  matmul_naive(float const *, float const *, float*, int, int, int)
                   34.98%  5.7964ms         1  5.7964ms  5.7964ms  5.7964ms  matmul_tiled(float const *, float const *, float*, int, int, int)
                    9.42%  1.5607ms         2  780.33us  756.14us  804.52us  [CUDA memcpy HtoD]
                    0.09%  14.399us         1  14.399us  14.399us  14.399us  [CUDA memset]
      API calls:   84.76%  106.30ms         3  35.433ms  66.524us  106.16ms  cudaMalloc
                   11.96%  15.003ms         2  7.5015ms  5.8008ms  9.2022ms  cudaEventSynchronize
                    1.88%  2.3518ms     