### matrix_transpose_naive
- **Description**: A simple implementation of matrix transposition without any optimization techniques.

### matrix_transpose_shared_uncoalesced
- **Description**: A matrix transposition method using shared memory and uncoalesced memory accesses in `output`.

### matrix_transpose_shared_uncoalesced_no_conflict
- **Description**: A matrix transposition method using shared memory and uncoalesced memory accesses, which avoids bank conflicts.

### matrix_transpose_shared_coalesced
- **Description**: A matrix transposition method using shared memory and coalesced memory accesses.

### matrix_transpose_shared_coalesced_no_conflict (the fastest one)
- **Description**: A matrix transposition method using shared memory and coalesced memory accesses, which avoids bank conflicts.

### avoid bank conflict
| bank 0 | bank 1 | bank 2 | bank 3 | ... | bank 30 | bank 31 |
| :---------: | :----------: | :----------: | :---------: | :----------: | :----------: | :----------: |
| 0 | 1 | 2 | 3 | ... | 30 | 31 |
| NULL | 32 | 33 | 34 | ... | 61 | 62 |
| 63 | NULL | 64 | 65 | ... | 92 | 93 |
| 94 | 95 | NULL | 96 | ... | 123 | 124 |  

When reading a column of `sharedMemory`, the accessed elements are 0, 32, 64, ..., which fall in banks 0, 1, 2, ..., so there is no bank conflict.

In [None]:
%%writefile matrix_transpose.cu
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <type_traits>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>

#define TYPE int
#define N 640
#define M 1280
#define BLOCK_SIZE 32

// 1 2 3 4
// 5 6 7 8

// 1 5
// 2 6
// 3 7
// 4 8
// Warm-up kernel: launched (per its name) before the transpose kernels.
// It uses the same 2D indexing and bounds check as they do, but only performs
// a throw-away float addition and never touches memory.
__global__ void warm_up() {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the N x M extent have nothing to do.
    if (col >= N || row >= M)
    {
        return;
    }

    const float lhs = 0.0f;
    const float rhs = 1.0f;
    float sum = lhs + rhs;
    (void)sum;  // result is intentionally discarded
}


// Straightforward transpose without shared memory.
//
// `input` is read as a row-major matrix with M rows and N columns; `output`
// is written as its row-major transpose (N rows, M columns). One thread
// handles one element, using a 2D grid that covers N in x and M in y.
//
// Consecutive threadIdx.x values read consecutive input addresses, but their
// writes are M elements apart, so the global store is not coalesced.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void matrix_transpose_naive(T *input, T *output) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip threads that fall outside the matrix.
    if (col >= N || row >= M)
    {
        return;
    }

    const int src = row * N + col;  // element (row, col) of the input
    const int dst = col * M + row;  // element (col, row) of the output

    // Alternative with an uncoalesced load instead of an uncoalesced store:
    //   output[src] = input[dst];
    output[dst] = input[src];
}

// Transpose that stages each element through shared memory but still writes
// global memory with a stride (uncoalesced store).
//
// `input` is read as a row-major matrix with M rows and N columns; `output`
// is written as its row-major transpose (N rows, M columns).
//
// Launch requirements: 2D blocks no larger than BLOCK_SIZE x BLOCK_SIZE (the
// tile is indexed directly by threadIdx), with a grid covering N in x and M
// in y. Uses BLOCK_SIZE * BLOCK_SIZE * sizeof(T) bytes of static shared memory.
//
// The tile is indexed [threadIdx.x][threadIdx.y], so neighbouring threads of a
// warp are BLOCK_SIZE elements apart in shared memory; with 4-byte elements and
// BLOCK_SIZE == 32 that puts them all in the same bank (bank conflicts).
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void matrix_transpose_shared_uncoalesed(T *input, T *output) {

    __shared__ T sharedMemory [BLOCK_SIZE] [BLOCK_SIZE];

    // global index
    int indexX = threadIdx.x + blockIdx.x * blockDim.x;
    int indexY = threadIdx.y + blockIdx.y * blockDim.y;

    // local index
    int localIndexX = threadIdx.x;
    int localIndexY = threadIdx.y;

    const bool insideMatrix = indexX < N && indexY < M;

    if (insideMatrix)
    {
        // coalesced global load: consecutive threadIdx.x -> consecutive addresses
        sharedMemory[localIndexX][localIndexY] = input[indexY * N + indexX];
    }

    // The barrier must be reached by every thread of the block, including
    // those outside the matrix, so it lives outside the bounds check.
    __syncthreads();

    if (insideMatrix)
    {
        // uncoalesced global store: consecutive threadIdx.x -> addresses M apart
        output[indexX * M + indexY] = sharedMemory[localIndexX][localIndexY];
    }
}

// Same as the shared-memory transpose with an uncoalesced store, but the tile
// has one padding column so that shared-memory accesses avoid bank conflicts.
//
// `input` is read as a row-major matrix with M rows and N columns; `output`
// is written as its row-major transpose (N rows, M columns).
//
// Launch requirements: 2D blocks no larger than BLOCK_SIZE x BLOCK_SIZE (the
// tile is indexed directly by threadIdx), with a grid covering N in x and M
// in y. Uses BLOCK_SIZE * (BLOCK_SIZE + 1) * sizeof(T) bytes of static shared
// memory.
//
// The tile is indexed [threadIdx.x][threadIdx.y]; thanks to the padding,
// neighbouring threads of a warp are BLOCK_SIZE + 1 elements apart, which for
// 4-byte elements and BLOCK_SIZE == 32 maps them to distinct banks.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void matrix_transpose_shared_uncoalesed_no_conflict(T *input, T *output) {

    __shared__ T sharedMemory [BLOCK_SIZE] [BLOCK_SIZE + 1];

    // global index
    int indexX = threadIdx.x + blockIdx.x * blockDim.x;
    int indexY = threadIdx.y + blockIdx.y * blockDim.y;

    // local index
    int localIndexX = threadIdx.x;
    int localIndexY = threadIdx.y;

    const bool insideMatrix = indexX < N && indexY < M;

    if (insideMatrix)
    {
        // coalesced global load: consecutive threadIdx.x -> consecutive addresses
        sharedMemory[localIndexX][localIndexY] = input[indexY * N + indexX];
    }

    // The barrier must be reached by every thread of the block, including
    // those outside the matrix, so it lives outside the bounds check.
    __syncthreads();

    if (insideMatrix)
    {
        // uncoalesced global store: consecutive threadIdx.x -> addresses M apart
        output[indexX * M + indexY] = sharedMemory[localIndexX][localIndexY];
    }
}

// Tiled transpose in which both the global load and the global store are
// coalesced; the transposition itself happens inside a shared-memory tile.
//
// `input` is read as a row-major matrix with M rows and N columns; `output`
// is written as its row-major transpose (N rows, M columns).
//
// Expects square BLOCK_SIZE x BLOCK_SIZE thread blocks (the tile is indexed by
// threadIdx in both orders) and a grid covering N in x and M in y.
//
// The tile is written as [threadIdx.x][threadIdx.y], i.e. with a stride of
// BLOCK_SIZE elements between neighbouring threads, and is unpadded, so that
// write is subject to bank conflicts.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void matrix_transpose_shared_coalesed(T *input, T *output) {
    __shared__ T tile[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Position of this thread's element in the input matrix.
    const int srcCol = threadIdx.x + blockIdx.x * blockDim.x;
    const int srcRow = threadIdx.y + blockIdx.y * blockDim.y;
    if (srcCol < N && srcRow < M)
    {
        // Coalesced load; the element is stored transposed inside the tile.
        tile[tx][ty] = input[srcRow * N + srcCol];
    }

    // Every thread reads an element written by another thread below.
    __syncthreads();

    // Position in the output matrix: block coordinates are swapped, thread
    // coordinates are not, so threadIdx.x still walks along a row.
    const int dstCol = threadIdx.x + blockIdx.y * blockDim.x;
    const int dstRow = threadIdx.y + blockIdx.x * blockDim.y;
    if (dstCol < M && dstRow < N)
    {
        // Coalesced store of the element staged by thread (ty, tx).
        output[dstRow * M + dstCol] = tile[ty][tx];
    }
}

// Tiled transpose with coalesced global load and store, using a shared-memory
// tile padded by one column to avoid bank conflicts.
//
// `input` is read as a row-major matrix with M rows and N columns; `output`
// is written as its row-major transpose (N rows, M columns).
//
// Expects square BLOCK_SIZE x BLOCK_SIZE thread blocks (the tile is indexed by
// threadIdx in both orders) and a grid covering N in x and M in y.
//
// The tile is written as [threadIdx.x][threadIdx.y]; with the extra column the
// stride between neighbouring threads is BLOCK_SIZE + 1 elements, which for
// 4-byte elements and BLOCK_SIZE == 32 spreads them over distinct banks.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void matrix_transpose_shared_coalesed_no_conflict(T *input, T *output) {
    __shared__ T tile[BLOCK_SIZE][BLOCK_SIZE + 1];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Position of this thread's element in the input matrix.
    const int srcCol = threadIdx.x + blockIdx.x * blockDim.x;
    const int srcRow = threadIdx.y + blockIdx.y * blockDim.y;
    if (srcCol < N && srcRow < M)
    {
        // Coalesced load; the element is stored transposed inside the tile.
        tile[tx][ty] = input[srcRow * N + srcCol];
    }

    // Every thread reads an element written by another thread below.
    __syncthreads();

    // Position in the output matrix: block coordinates are swapped, thread
    // coordinates are not, so threadIdx.x still walks along a row.
    const int dstCol = threadIdx.x + blockIdx.y * blockDim.x;
    const int dstRow = threadIdx.y + blockIdx.x * blockDim.y;
    if (dstCol < M && dstRow < N)
    {
        // Coalesced store of the element staged by thread (ty, tx).
        output[dstRow * M + dstCol] = tile[ty][tx];
    }
}

// Prints two host arrays of N * M elements each to standard output.
//
// a: printed with a line break before every N-th element (rows of N values).
// b: printed with a line break before every M-th element (rows of M values).
//
// Both pointers must be host-accessible. Rows are separated with '\n' rather
// than std::endl so the stream is flushed once at the end instead of once per
// row, and the output is terminated with a newline.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
void print_output(T *a, T *b)
{
    // Streams N * M values, starting a new line before every `width`-th one.
    auto print_rows = [](const T *values, int width)
    {
        for (int i = 0; i < N * M; ++i)
        {
            if (i % width == 0)
            {
                std::cout << '\n';
            }
            std::cout << values[i] << ' ';
        }
    };

    print_rows(a, N);
    std::cout << '\n';
    print_rows(b, M);

    // Terminate the last row and flush everything in one go.
    std::cout << std::endl;
}

// Runs the warm-up kernel and then every transpose kernel on the same input,
// checking each launch for CUDA errors and comparing each result against the
// expected transpose on the host.
//
// Returns 0 when every kernel ran and produced the correct transpose, and
// EXIT_FAILURE on the first CUDA error or mismatch (details go to stderr).
int main()
{
    // Host copies: a holds the input (M rows x N columns), b the transposed output.
    thrust::host_vector<TYPE> a(N*M);
    thrust::host_vector<TYPE> b(N*M);

    // Device copies; the input is filled with the sequence 0, 1, 2, ...
    thrust::device_vector<TYPE> d_a(N*M, 0);
    thrust::device_vector<TYPE> d_b(N*M, 0);
    thrust::sequence(d_a.begin(), d_a.end(), 0, 1);
    thrust::copy(d_a.begin(), d_a.end(), a.begin());

    TYPE *input = thrust::raw_pointer_cast(d_a.data());
    TYPE *output = thrust::raw_pointer_cast(d_b.data());

    dim3 threads_per_block(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 no_of_blocks((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);

    // Reports launch-configuration errors and errors raised while the kernel ran.
    auto launch_succeeded = [](const char *kernel_name) -> bool
    {
        cudaError_t status = cudaGetLastError();
        if (status == cudaSuccess)
        {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess)
        {
            fprintf(stderr, "%s: CUDA error: %s\n", kernel_name, cudaGetErrorString(status));
            return false;
        }
        return true;
    };

    // Zeroes the output buffer so a kernel cannot pass on a previous kernel's result.
    auto clear_output = [&]() -> bool
    {
        cudaError_t status = cudaMemset(output, 0, sizeof(TYPE) * N * M);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "cudaMemset: CUDA error: %s\n", cudaGetErrorString(status));
            return false;
        }
        return true;
    };

    // Waits for the kernel, copies its output into b and checks b against a.
    auto kernel_passed = [&](const char *kernel_name) -> bool
    {
        if (!launch_succeeded(kernel_name))
        {
            return false;
        }
        thrust::copy(d_b.begin(), d_b.end(), b.begin());
        for (int row = 0; row < M; ++row)
        {
            for (int col = 0; col < N; ++col)
            {
                if (b[col * M + row] != a[row * N + col])
                {
                    fprintf(stderr, "%s: wrong value for input row %d, column %d\n",
                            kernel_name, row, col);
                    return false;
                }
            }
        }
        return true;
    };

    warm_up<<<no_of_blocks, threads_per_block>>>();
    if (!launch_succeeded("warm_up")) return EXIT_FAILURE;

    if (!clear_output()) return EXIT_FAILURE;
    matrix_transpose_naive<<<no_of_blocks, threads_per_block>>>(input, output);
    if (!kernel_passed("matrix_transpose_naive")) return EXIT_FAILURE;

    if (!clear_output()) return EXIT_FAILURE;
    matrix_transpose_shared_uncoalesed<<<no_of_blocks, threads_per_block>>>(input, output);
    if (!kernel_passed("matrix_transpose_shared_uncoalesed")) return EXIT_FAILURE;

    if (!clear_output()) return EXIT_FAILURE;
    matrix_transpose_shared_uncoalesed_no_conflict<<<no_of_blocks, threads_per_block>>>(input, output);
    if (!kernel_passed("matrix_transpose_shared_uncoalesed_no_conflict")) return EXIT_FAILURE;

    if (!clear_output()) return EXIT_FAILURE;
    matrix_transpose_shared_coalesed<<<no_of_blocks, threads_per_block>>>(input, output);
    if (!kernel_passed("matrix_transpose_shared_coalesed")) return EXIT_FAILURE;

    if (!clear_output()) return EXIT_FAILURE;
    matrix_transpose_shared_coalesed_no_conflict<<<no_of_blocks, threads_per_block>>>(input, output);
    if (!kernel_passed("matrix_transpose_shared_coalesed_no_conflict")) return EXIT_FAILURE;

    // At this point a holds the input and b the output of the last kernel.
    // print_output(a.data(), b.data());

    return 0;
}

In [None]:
!nvcc -o matrix_transpose matrix_transpose.cu

In [None]:
!./matrix_transpose

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!apt update
!apt install ./drive/MyDrive/Nsight/nsight-systems-2023.2.3_2023.2.3.1001-1_amd64.deb
!apt --fix-broken install

In [None]:
!nsys profile -o report_matrix_transpose ./matrix_transpose