In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [None]:

%%cuda --compiler-args "-gencode arch=compute_75,code=sm_75"
#include <cstdio>
#include <cuda_runtime.h>



// ---------------------------
// CUDA KERNEL: Matrix Addition
// ---------------------------
/**
 * Element-wise sum of two row-major m x n matrices: C = A + B.
 *
 * Each thread produces at most one output element. The x dimension of the
 * launch maps to columns and the y dimension maps to rows, so the grid must
 * cover at least n threads in x and m threads in y. Threads that land
 * outside the matrix do nothing.
 *
 * @param A  device pointer to the first operand  (m * n floats, read only)
 * @param B  device pointer to the second operand (m * n floats, read only)
 * @param C  device pointer to the result         (m * n floats, written)
 * @param m  number of rows
 * @param n  number of columns
 */
__global__ void matAdd(const float *A, const float *B, float *C, int m, int n)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < m && col < n)
    {
        // Widen before multiplying: row * n in 32-bit int overflows once the
        // matrix holds more than INT_MAX elements.
        const size_t idx = (size_t)row * n + col;
        C[idx] = A[idx] + B[idx];
    }
}

// -------------------------------
// CUDA KERNEL: Matrix Multiplication
// -------------------------------
/**
 * Dense matrix product of row-major matrices: C (m x n) = A (m x k) * B (k x n).
 *
 * Each thread produces at most one output element by walking one row of A
 * and one column of B. The x dimension of the launch maps to output columns
 * and the y dimension maps to output rows, so the grid must cover at least
 * n threads in x and m threads in y. Threads outside the output do nothing.
 *
 * @param A  device pointer to the left operand  (m * k floats, read only)
 * @param B  device pointer to the right operand (k * n floats, read only)
 * @param C  device pointer to the result        (m * n floats, written)
 * @param m  rows of A and of C
 * @param k  columns of A, which is also the number of rows of B
 * @param n  columns of B and of C
 */
__global__ void matMul(const float *A, const float *B, float *C, int m, int k, int n)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < m && col < n)
    {
        // All offsets are formed in size_t so that row * k, i * n and
        // row * n cannot overflow a 32-bit int on large matrices.
        const size_t aRowStart = (size_t)row * k;

        float sum = 0.0f;  // float literal: no double involved in the init
        for (int i = 0; i < k; i++)
            sum += A[aRowStart + i] * B[(size_t)i * n + col];

        C[(size_t)row * n + col] = sum;
    }
}

// ---------------------------
// MAIN FUNCTION
// ---------------------------
// Evaluates a CUDA runtime call. On failure it prints the error with its
// source location, resets the device so every device allocation made so far
// is released, and makes the enclosing function return 1. It may only be
// used inside a function that returns int.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            cudaDeviceReset();                                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)

/**
 * Demo driver: builds two small matrices on the host, runs the matAdd and
 * matMul kernels on the device, and prints the inputs and both results.
 *
 * Returns 0 on success and 1 if any CUDA runtime call or kernel launch fails.
 */
int main()
{
    // Compile-time constants, so the host arrays below are ordinary
    // fixed-size arrays rather than variable-length arrays.
    const int m = 3, k = 3, n = 3;  // A is m x k, B is k x n

    // A + B is only defined when both operands have the same shape. matAdd
    // is launched with (m, n), so A (m x k) and B (k x n) are only indexed
    // correctly when m == k == n.
    const bool canAdd = (m == k) && (k == n);

    const size_t bytesA = sizeof(float) * m * k;
    const size_t bytesB = sizeof(float) * k * n;
    const size_t bytesC = sizeof(float) * m * n;

    // Host memory
    float A[m * k], B[k * n], C_add[m * n], C_mul[m * n];

    // ---------------------------
    // Initialize the input matrices
    // ---------------------------
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            A[i * k + j] = (float)(i + j);       // Example init
        }
    }

    for (int i = 0; i < k; i++) {
        for (int j = 0; j < n; j++) {
            B[i * n + j] = (float)(i * 2 + j);   // Example init
        }
    }

    // Device memory
    float *dA = NULL, *dB = NULL, *dCadd = NULL, *dCmul = NULL;
    CUDA_CHECK(cudaMalloc(&dA, bytesA));
    CUDA_CHECK(cudaMalloc(&dB, bytesB));
    CUDA_CHECK(cudaMalloc(&dCadd, bytesC));
    CUDA_CHECK(cudaMalloc(&dCmul, bytesC));

    CUDA_CHECK(cudaMemcpy(dA, A, bytesA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dB, B, bytesB, cudaMemcpyHostToDevice));

    // Launch config: 16 x 16 threads per block, with enough blocks (rounded
    // up) to cover n columns in x and m rows in y.
    const int TILE = 16;
    dim3 block(TILE, TILE);
    dim3 grid((n + TILE - 1) / TILE, (m + TILE - 1) / TILE);

    // Matrix Add
    if (canAdd) {
        matAdd<<<grid, block>>>(dA, dB, dCadd, m, n);
        CUDA_CHECK(cudaGetLastError());        // invalid launch configuration
        CUDA_CHECK(cudaDeviceSynchronize());   // faults raised while running
        CUDA_CHECK(cudaMemcpy(C_add, dCadd, bytesC, cudaMemcpyDeviceToHost));
    }

    // Matrix Multiply
    matMul<<<grid, block>>>(dA, dB, dCmul, m, k, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(C_mul, dCmul, bytesC, cudaMemcpyDeviceToHost));

    // ---------------------------
    // Print Results
    // ---------------------------
    printf("\nMatrix A:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++)
            printf("%5.1f ", A[i * k + j]);
        printf("\n");
    }

    printf("\nMatrix B:\n");
    for (int i = 0; i < k; i++) {
        for (int j = 0; j < n; j++)
            printf("%5.1f ", B[i * n + j]);
        printf("\n");
    }

    if (canAdd) {
        printf("\nMatrix Addition (A + B):\n");
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++)
                printf("%5.1f ", C_add[i * n + j]);
            printf("\n");
        }
    } else {
        printf("\nMatrix Addition skipped: A and B have different shapes\n");
    }

    printf("\nMatrix Multiplication (A × B):\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++)
            printf("%6.1f ", C_mul[i * n + j]);
        printf("\n");
    }

    CUDA_CHECK(cudaFree(dA));
    CUDA_CHECK(cudaFree(dB));
    CUDA_CHECK(cudaFree(dCadd));
    CUDA_CHECK(cudaFree(dCmul));

    return 0;
}



Matrix A:
  0.0   1.0   2.0 
  1.0   2.0   3.0 
  2.0   3.0   4.0 

Matrix B:
  0.0   1.0   2.0 
  2.0   3.0   4.0 
  4.0   5.0   6.0 

Matrix Addition (A + B):
  0.0   2.0   4.0 
  3.0   5.0   7.0 
  6.0   8.0  10.0 

Matrix Multiplication (A × B):
  10.0   13.0   16.0 
  16.0   22.0   28.0 
  22.0   31.0   40.0 

