In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [4]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-s5fvvpf7
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-s5fvvpf7
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=bc7b6aef6a4bed4cde81c528d2a3f04c869c9613cd51ce9c00258b528b2df235
  Stored in directory: /tmp/pip-ephem-wheel-cache-h6tehc17/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
... (output truncated)

In [5]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [6]:
%%cu

#include <stdio.h>

// CUDA kernel for element-wise vector addition: c[i] = a[i] + b[i] for 0 <= i < size.
//
// Launch layout: 1-D grid of 1-D blocks; only the x dimension is used.
// Each thread starts at its global index and advances by the total number of
// launched threads (grid-stride loop), so every element is processed even
// when the grid contains fewer threads than `size`. No shared memory is used.
//
// Parameters:
//   a, b  - input vectors, only read by the kernel
//   c     - output vector, only written by the kernel
//   size  - number of elements to process; nothing is done when size <= 0
__global__ void vectorAdd(const int* a, const int* b, int* c, int size) 
{
    // 64-bit index arithmetic keeps blockIdx.x * blockDim.x and the stride
    // accumulation from wrapping on large launches.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long tid = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         tid < size;
         tid += stride) {
        c[tid] = a[tid] + b[tid];
    }
}

// Stops the program with a diagnostic on stderr when a CUDA runtime call
// did not return cudaSuccess. `what` names the operation for the message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Host driver for the vector-addition demo.
//
// Fills two 100-element host vectors with a[i] = b[i] = i, copies them to the
// device, launches vectorAdd, copies the sum back and prints one
// "a + b = c" line per element. Returns 0 on success; any host allocation or
// CUDA failure is reported on stderr and ends the program with status 1.
int main() 
{
    const int size = 100;                     // Size of the vectors
    const size_t bytes = size * sizeof(int);  // Byte count shared by all buffers
    int* a, * b, * c;    // Host vectors
    int* dev_a, * dev_b, * dev_c;  // Device vectors

    // Allocate memory for host vectors
    a = (int*)malloc(bytes);
    b = (int*)malloc(bytes);
    c = (int*)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return 1;
    }

    // Initialize host vectors
    for (int i = 0; i < size; i++) {
        a[i] = i;
        b[i] = i;
    }

    // Allocate memory on the device for device vectors
    checkCuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc dev_a");
    checkCuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc dev_b");
    checkCuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc dev_c");

    // Copy host vectors to device
    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Launch kernel for vector addition
    int blockSize = 256; //threads
    int gridSize = (size + blockSize - 1) / blockSize;  //blocks (ceiling division)
    vectorAdd<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, size);
    // A kernel launch returns no status itself; a bad launch configuration
    // only shows up through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // Copy result from device to host. This blocking copy also reports any
    // error raised while the kernel was executing.
    checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copy c to host");

    // Print result
    for (int i = 0; i < size; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // Free device memory
    checkCuda(cudaFree(dev_a), "cudaFree dev_a");
    checkCuda(cudaFree(dev_b), "cudaFree dev_b");
    checkCuda(cudaFree(dev_c), "cudaFree dev_c");

    // Free host memory
    free(a);
    free(b);
    free(c);

    return 0;
}

0 + 0 = 0
1 + 1 = 2
2 + 2 = 4
3 + 3 = 6
4 + 4 = 8
5 + 5 = 10
6 + 6 = 12
7 + 7 = 14
8 + 8 = 16
9 + 9 = 18
10 + 10 = 20
11 + 11 = 22
12 + 12 = 24
13 + 13 = 26
14 + 14 = 28
15 + 15 = 30
16 + 16 = 32
17 + 17 = 34
18 + 18 = 36
19 + 19 = 38
20 + 20 = 40
21 + 21 = 42
22 + 22 = 44
23 + 23 = 46
24 + 24 = 48
25 + 25 = 50
26 + 26 = 52
27 + 27 = 54
28 + 28 = 56
29 + 29 = 58
30 + 30 = 60
31 + 31 = 62
32 + 32 = 64
33 + 33 = 66
34 + 34 = 68
35 + 35 = 70
36 + 36 = 72
37 + 37 = 74
38 + 38 = 76
39 + 39 = 78
40 + 40 = 80
41 + 41 = 82
42 + 42 = 84
43 + 43 = 86
44 + 44 = 88
45 + 45 = 90
46 + 46 = 92
47 + 47 = 94
48 + 48 = 96
49 + 49 = 98
50 + 50 = 100
51 + 51 = 102
52 + 52 = 104
53 + 53 = 106
54 + 54 = 108
55 + 55 = 110
56 + 56 = 112
57 + 57 = 114
58 + 58 = 116
59 + 59 = 118
60 + 60 = 120
61 + 61 = 122
62 + 62 = 124
63 + 63 = 126
64 + 64 = 128
65 + 65 = 130
66 + 66 = 132
67 + 67 = 134
68 + 68 = 136
69 + 69 = 138
70 + 70 = 140
71 + 71 = 142
72 + 72 = 144
73 + 73 = 146
74 + 74 = 148
75 + 75 = 150
76 + 76 = 152
... (remaining output truncated)

In [20]:
%%cu
#include <stdio.h>

// Square matrix product C = A * B for N x N row-major int matrices.
// Launch layout: 2-D grid of 2-D blocks; x indexes the output column and
// y the output row. Each in-range thread produces exactly one element of C;
// threads that fall outside the N x N output do nothing.
__global__ void matrixMultiply(int *A, int *B, int *C, int N) {
  const int r = blockIdx.y * blockDim.y + threadIdx.y;
  const int c = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard clause: the grid is rounded up, so some threads have no element.
  if (r >= N || c >= N) {
    return;
  }

  // Dot product of row r of A with column c of B.
  const int rowBase = r * N;
  int acc = 0;
  for (int k = 0; k < N; ++k) {
    acc += A[rowBase + k] * B[k * N + c];
  }
  C[rowBase + c] = acc;
}

// Stops the program with a diagnostic on stderr when a CUDA runtime call
// did not return cudaSuccess. `what` names the operation for the message.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(1);
  }
}

// Prints an N x N row-major matrix, one row per line, each value followed
// by a single space.
static void printSquareMatrix(const int *m, int N) {
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      printf("%d ", m[i * N + j]);
    }
    printf("\n");
  }
}

// Host driver for the square matrix-multiplication demo.
//
// Builds two 2 x 2 matrices (A[i][j] = i + j, B[i][j] = i * N + j), prints
// them, multiplies them on the device with matrixMultiply and prints the
// product. Returns 0 on success; any host allocation or CUDA failure is
// reported on stderr and ends the program with status 1.
int main() {
  const int N = 2;
  // Byte count kept in size_t: sizeof yields size_t, so storing the product
  // in an int would narrow it.
  const size_t size = (size_t)N * N * sizeof(int);
  int *A, *B, *C;
  int *dev_A, *dev_B, *dev_C;

  // Allocate memory for matrices A, B, and C on the host
  A = (int *)malloc(size);
  B = (int *)malloc(size);
  C = (int *)malloc(size);
  if (A == NULL || B == NULL || C == NULL) {
    fprintf(stderr, "Host allocation failed\n");
    free(A);
    free(B);
    free(C);
    return 1;
  }

  // Initialize matrices A and B
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      A[i * N + j] = i + j;
      B[i * N + j] = i * N + j;
    }
  }

  printf("initial matrix A:\n");
  printSquareMatrix(A, N);

  printf("initial matrix B:\n");
  printSquareMatrix(B, N);

  // Allocate memory for matrices A, B, and C on the device
  checkCuda(cudaMalloc(&dev_A, size), "cudaMalloc dev_A");
  checkCuda(cudaMalloc(&dev_B, size), "cudaMalloc dev_B");
  checkCuda(cudaMalloc(&dev_C, size), "cudaMalloc dev_C");

  // Copy matrices A and B from host to device
  checkCuda(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice), "copy A to device");
  checkCuda(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice), "copy B to device");

  // Define grid and block dimensions (ceiling division so the grid covers N)
  dim3 dimBlock(16, 16);
  dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);

  // Launch the matrix multiplication kernel
  matrixMultiply<<<dimGrid, dimBlock>>>(dev_A, dev_B, dev_C, N);
  // A kernel launch returns no status itself; a bad launch configuration
  // only shows up through cudaGetLastError().
  checkCuda(cudaGetLastError(), "matrixMultiply launch");

  // Copy the result matrix C from device to host. This blocking copy also
  // reports any error raised while the kernel was executing.
  checkCuda(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost), "copy C to host");

  // Print the result matrix
  printf("Result matrix C:\n");
  printSquareMatrix(C, N);

  // Free device memory
  checkCuda(cudaFree(dev_A), "cudaFree dev_A");
  checkCuda(cudaFree(dev_B), "cudaFree dev_B");
  checkCuda(cudaFree(dev_C), "cudaFree dev_C");

  // Free host memory
  free(A);
  free(B);
  free(C);

  return 0;
}


initial matrix A:
0 1 
1 2 
initial matrix B:
0 1 
2 3 
Result matrix C:
2 3 
4 7 



In [25]:
# MATRIX MULTIPLICATION

%%cu

#include <stdio.h>

// CUDA kernel computing c = a * b for row-major int matrices, where a is
// rowsA x colsA, b is colsA x colsB and c is rowsA x colsB.
// Launch layout: 2-D grid of 2-D blocks; x indexes the output column and
// y the output row. Threads outside the rowsA x colsB output do nothing.
__global__ void matrixMul(int* a, int* b, int* c, int rowsA, int colsA, int colsB) {
    const int outRow = threadIdx.y + blockIdx.y * blockDim.y;
    const int outCol = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard clause: the grid is rounded up, so some threads have no element.
    if (outRow >= rowsA || outCol >= colsB) {
        return;
    }

    // Dot product of row outRow of a with column outCol of b.
    int acc = 0;
    int k = 0;
    while (k < colsA) {
        const int lhs = a[outRow * colsA + k];
        const int rhs = b[k * colsB + outCol];
        acc += lhs * rhs;
        ++k;
    }
    c[outRow * colsB + outCol] = acc;
}

// Stops the program with a diagnostic on stderr when a CUDA runtime call
// did not return cudaSuccess. `what` names the operation for the message.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Host driver for the rectangular matrix-multiplication demo.
//
// Builds a (rowsA x colsA, a[i] = i) and b (rowsB x colsB, b[i] = 2 * i),
// multiplies them on the device with matrixMul and prints the rowsA x colsB
// product. Returns 0 on success; any host allocation or CUDA failure is
// reported on stderr and ends the program with status 1.
int main() {
    const int rowsA = 2;  // Rows of matrix A
    const int colsA = 2;  // Columns of matrix A
    const int rowsB = colsA; // Rows of matrix B (must equal columns of A)
    const int colsB = 2;  // Columns of matrix B

    // Byte counts computed once, in size_t, and reused for every
    // allocation and copy of the corresponding matrix.
    const size_t bytesA = (size_t)rowsA * colsA * sizeof(int);
    const size_t bytesB = (size_t)rowsB * colsB * sizeof(int);
    const size_t bytesC = (size_t)rowsA * colsB * sizeof(int);

    int* a, * b, * c;  // Host matrices
    int* dev_a, * dev_b, * dev_c;  // Device matrices

    // Allocate memory for host matrices
    a = (int*)malloc(bytesA);
    b = (int*)malloc(bytesB);
    c = (int*)malloc(bytesC);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return 1;
    }

    // Initialize host matrices
    for (int i = 0; i < rowsA * colsA; i++) {
        a[i] = i;
    }
    for (int i = 0; i < rowsB * colsB; i++) {
        b[i] = 2 * i;
    }

    // Allocate memory on the device for device matrices
    checkCuda(cudaMalloc((void**)&dev_a, bytesA), "cudaMalloc dev_a");
    checkCuda(cudaMalloc((void**)&dev_b, bytesB), "cudaMalloc dev_b");
    checkCuda(cudaMalloc((void**)&dev_c, bytesC), "cudaMalloc dev_c");

    // Copy host matrices to device
    checkCuda(cudaMemcpy(dev_a, a, bytesA, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(dev_b, b, bytesB, cudaMemcpyHostToDevice), "copy b to device");

    // Define grid and block dimensions (ceiling division so the grid covers
    // every output column in x and every output row in y)
    dim3 blockSize(16, 16);
    dim3 gridSize((colsB + blockSize.x - 1) / blockSize.x, (rowsA + blockSize.y - 1) / blockSize.y);

    // Launch kernel for matrix multiplication
    matrixMul<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, rowsA, colsA, colsB);
    // A kernel launch returns no status itself; a bad launch configuration
    // only shows up through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matrixMul launch");

    // Copy result from device to host. This blocking copy also reports any
    // error raised while the kernel was executing.
    checkCuda(cudaMemcpy(c, dev_c, bytesC, cudaMemcpyDeviceToHost), "copy c to host");

    // Print result
    printf("Result:\n");
    for (int i = 0; i < rowsA; i++) {
        for (int j = 0; j < colsB; j++) {
            printf("%d ", c[i * colsB + j]);
        }
        printf("\n");
    }

    // Free device memory
    checkCuda(cudaFree(dev_a), "cudaFree dev_a");
    checkCuda(cudaFree(dev_b), "cudaFree dev_b");
    checkCuda(cudaFree(dev_c), "cudaFree dev_c");

    // Free host memory
    free(a);
    free(b);
    free(c);

    return 0;
}

Result:
4 6 
12 22 

