# Check for prerequisites

In [171]:
# check for cuda toolkit
!ls /usr/local/

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [172]:
# check for cuda compiler driver
!which nvcc

'which' is not recognized as an internal or external command,
operable program or batch file.


In [173]:
# check for CUDA device
!nvidia-smi

Wed Apr 17 12:01:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.70                 Driver Version: 537.70       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce MX550         WDDM  | 00000000:02:00.0 Off |                  N/A |
| N/A   72C    P0              10W /  30W |    108MiB /  2048MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [174]:
# check for cpp compiler
!which g++

'which' is not recognized as an internal or external command,
operable program or batch file.


# Creating CUDA file

Device header file

In [175]:
%%writefile inverse-matrix.cuh

#include <cuda_runtime.h>

// Placeholder kernel: its definition currently only executes two
// __syncthreads() barriers and does not read or modify the matrix.
__global__ void calculateInverse(double* matrix_pointer, int width, int height, int currRow);

// Normalisation step: divides the elements of row `currRow` of the row-major
// matrix (`width` doubles per row) by that row's diagonal element,
// matrix_pointer[width * currRow + currRow].
__global__ void step1(double* matrix_pointer, int width, int height, int currRow);

// Elimination step: for every row other than `currRow` (rows < height) and
// every column in [currRow, width), subtracts
// (that row's entry in column currRow) * (row currRow's entry in that column).
__global__ void step2(double* matrix_pointer, int width, int height, int currRow);

Overwriting inverse-matrix.cuh


Device source code

In [176]:
%%writefile inverse-matrix.cu

#include <stdio.h>
#include "inverse-matrix.cuh"

using namespace std;

// Placeholder for a fused "normalise + eliminate" kernel.
//
// As written it performs no arithmetic: the calls to step1/step2 are commented
// out and the body only executes two block-level barriers. None of the
// parameters are read. The host code in this notebook also has its launch of
// this kernel commented out and launches step1 and step2 directly instead.
__global__ void calculateInverse(double* matrix_pointer, int width, int height, int currRow)
{
  // Disabled: step1 is declared __global__ (a kernel), not a __device__ function.
  // step1(matrix_pointer, width, height, currRow);
  __syncthreads();  // barrier for the threads of this block only
  // Disabled: step2 is declared __global__ (a kernel), not a __device__ function.
  // step2(matrix_pointer, width, height, currRow);
  __syncthreads();  // barrier for the threads of this block only
}

// Normalisation step of Gauss-Jordan elimination: divides every element of
// row `currRow` by that row's diagonal element so the pivot becomes 1.
//
// Parameters
//   matrix_pointer  device pointer to a row-major matrix of `width` columns
//   width           number of columns (doubles per row)
//   height          number of rows (unused here; kept for a uniform signature)
//   currRow         index of the pivot row / pivot column
//
// Launch layout: any 1D/2D/3D configuration is accepted. Only block (0,0,0)
// does work, walking the row with a block-stride loop; all other blocks exit
// immediately. The row is deliberately owned by a single block because:
//   * the thread that owns column `currRow` overwrites the diagonal, and
//     __syncthreads() cannot order that write against reads of the diagonal
//     performed by *other* blocks, and
//   * with a 2D launch, every y-replica of a column would otherwise
//     read-modify-write the same element.
// Shared memory: one statically allocated double.
//
// No pivoting is done: a zero diagonal element yields inf/NaN in the row.
__global__ void step1(double* matrix_pointer, int width, int height, int currRow) {
  (void)height;

  // Whole blocks leave together, so the barrier below is never divergent.
  if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) {
    return;
  }

  const int threadsInBlock = blockDim.x * blockDim.y * blockDim.z;
  const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

  double* pivotRow = matrix_pointer + (size_t)currRow * (size_t)width;

  // Snapshot the diagonal before any thread is allowed to overwrite it.
  __shared__ double diagonal;
  if (tid == 0) {
    diagonal = pivotRow[currRow];
  }
  __syncthreads();

  const double pivot = diagonal;
  for (int col = tid; col < width; col += threadsInBlock) {
    pivotRow[col] = pivotRow[col] / pivot;
  }
}

// Elimination step of Gauss-Jordan elimination: for every row r != currRow,
//   row[r][c] -= row[r][currRow] * row[currRow][c]   for c in [currRow, width)
// which clears column `currRow` in all rows except the pivot row (assuming the
// pivot row has already been normalised so that its diagonal element is 1).
//
// Parameters
//   matrix_pointer  device pointer to a row-major matrix of `width` columns
//   width           number of columns (doubles per row)
//   height          number of rows
//   currRow         index of the pivot row / pivot column
//
// Launch layout: 2D launch (z extent 1) with gridDim.y * blockDim.y >= height.
// Rows are mapped to the y dimension; columns are covered by a stride loop over
// threadIdx.x. Only blocks with blockIdx.x == 0 do work, so each matrix row is
// owned by exactly one block. That ownership is what makes the update race
// free: the column-`currRow` entry of a row is both the multiplier every
// thread of that row needs *and* an element that gets overwritten, and
// __syncthreads() can only order those accesses inside a single block.
// Shared memory: none.
__global__ void step2(double* matrix_pointer, int width, int height, int currRow) {
  // Whole blocks leave together, so the barrier below is never divergent.
  if (blockIdx.x != 0 || blockIdx.z != 0) {
    return;
  }

  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const bool rowActive = (threadIdx.z == 0) && (row < height) && (row != currRow);

  // Every thread captures its row's multiplier in a register first ...
  double multiplier = 0.0;
  if (rowActive) {
    multiplier = matrix_pointer[(size_t)row * (size_t)width + (size_t)currRow];
  }
  // ... and only after all of them have done so may column `currRow` change.
  __syncthreads();

  if (!rowActive) {
    return;
  }

  double* targetRow = matrix_pointer + (size_t)row * (size_t)width;
  // The pivot row is only read in this kernel (row != currRow above).
  const double* pivotRow = matrix_pointer + (size_t)currRow * (size_t)width;

  // Columns left of the pivot are not touched.
  for (int col = currRow + (int)threadIdx.x; col < width; col += (int)blockDim.x) {
    targetRow[col] = targetRow[col] - multiplier * pivotRow[col];
  }
}

Overwriting inverse-matrix.cu


Host source code

In [183]:
%%writefile cuda.cu

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include "inverse-matrix.cuh"

using namespace std;

// Print a row-major matrix to standard output, one matrix row per line.
//
//   matrix       pointer to count_row * count_width doubles
//   count_width  number of columns in each row
//   count_row    number of rows
//   res          non-zero: print only the right half of every row (columns
//                count_width / 2 and up);
//                zero: print every column and put a "|" after the last column
//                of the left half.
// Every printed value is followed by a single space.
void printMatrix(double *matrix, int count_width, int count_row, int res)
{
  const int half = count_width / 2;

  for (int r = 0; r < count_row; ++r)
  {
    const double *line = matrix + r * count_width;

    for (int c = 0; c < count_width; ++c)
    {
      if (res)
      {
        // Result mode: the left half is skipped entirely.
        if (c >= half)
        {
          std::cout << line[c] << " ";
        }
        continue;
      }

      // Full mode: value, then the separator once the left half is complete.
      std::cout << line[c] << " ";
      if (c == half - 1)
      {
        std::cout << "|";
      }
    }
    std::cout << std::endl;
  }
}

// Abort with a readable message when a CUDA runtime call fails. Kernel launches
// return nothing, so they are checked with CUDA_CHECK(cudaGetLastError()).
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t cudaStatus_ = (call);                                        \
    if (cudaStatus_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(cudaStatus_));                              \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)

// Reads a square matrix from standard input (first the size N, then N*N
// values in row-major order), inverts it on the GPU with Gauss-Jordan
// elimination on the augmented matrix [A | I], and prints the right half of
// the result (the inverse) to standard output.
//
// No pivoting is performed: a zero on the diagonal during elimination leads to
// inf/NaN values in the output.
//
// Returns 0 on success and EXIT_FAILURE on malformed input, allocation failure
// or any CUDA error.
int main(int argc,char **argv)
{
  (void)argc;
  (void)argv;

  int mat_size = 0;
  if (!(std::cin >> mat_size) || mat_size <= 0)
  {
    fprintf(stderr, "Expected a positive matrix size on standard input\n");
    return EXIT_FAILURE;
  }

  const int count_x = mat_size;      // rows of the augmented matrix
  const int count_y = mat_size * 2;  // columns of the augmented matrix

  // Do the size arithmetic in size_t; int * int overflows for large N.
  const size_t bytes = (size_t)count_x * (size_t)count_y * sizeof(double);

  double *matrix = (double *)malloc(bytes);
  if (matrix == NULL)
  {
    fprintf(stderr, "Could not allocate %zu bytes of host memory\n", bytes);
    return EXIT_FAILURE;
  }

  // Build [A | I] row by row: the input on the left, the identity on the right.
  for (int i = 0; i < mat_size; i++)
  {
    double *row = matrix + (size_t)i * (size_t)count_y;

    for (int j = 0; j < mat_size; j++)
    {
      if (!(std::cin >> row[j]))
      {
        fprintf(stderr, "Could not read matrix element (%d, %d)\n", i, j);
        free(matrix);
        return EXIT_FAILURE;
      }
    }
    for (int j = mat_size; j < count_y; j++)
    {
      row[j] = (j - mat_size == i) ? 1.0 : 0.0;
    }
  }

  // 2D configuration for the elimination kernel: x covers columns, y covers rows.
  const dim3 block(16, 16);
  const dim3 grid((count_y + block.x - 1) / block.x, (count_x + block.y - 1) / block.y);

  // 1D configuration for the row-normalisation kernel. It only looks at the x
  // index, so a 2D launch would make several threads update the same element.
  const dim3 rowBlock(256);
  const dim3 rowGrid((count_y + rowBlock.x - 1) / rowBlock.x);

  double *device_matrix_pointer = NULL;

  CUDA_CHECK(cudaMalloc((void**)&device_matrix_pointer, bytes));
  CUDA_CHECK(cudaMemcpy(device_matrix_pointer, matrix, bytes, cudaMemcpyHostToDevice));

  printf("grid.x %u grid.y %u grid.z %u\n", grid.x, grid.y, grid.z);
  printf("block.x %u block.y %u block.z %u\n", block.x, block.y, block.z);

  // Execution configuration is <<<grid, block>>>, in that order. Launches on
  // the same stream run in issue order, so no device-wide synchronisation is
  // needed between the two steps or between iterations.
  for (int i = 0; i < count_x; i++) {
    step1<<<rowGrid, rowBlock>>>(device_matrix_pointer, count_y, count_x, i);
    CUDA_CHECK(cudaGetLastError());
    step2<<<grid, block>>>(device_matrix_pointer, count_y, count_x, i);
    CUDA_CHECK(cudaGetLastError());
  }
  // Surface any asynchronous execution error before reading the result.
  CUDA_CHECK(cudaDeviceSynchronize());

  CUDA_CHECK(cudaMemcpy(matrix, device_matrix_pointer, bytes, cudaMemcpyDeviceToHost));
  printMatrix(matrix, count_y, count_x, 1);

  CUDA_CHECK(cudaFree(device_matrix_pointer));
  free(matrix);
  CUDA_CHECK(cudaDeviceReset());
  return 0;
}

Overwriting cuda.cu


In [178]:
# Compile the code.
!nvcc inverse-matrix.cu cuda.cu -o cuda.exe

In [185]:
# !!! Make sure the test file is uploaded !!!
!cuda.exe < ./64.txt > out.txt