In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!pip install nvcc4jupyter


Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [3]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmplfr_bqvv".


In [9]:
%%cuda
#include <iostream>
#include <vector>
#include <limits.h>
#include <chrono>

#define MAX_V 4  // Maximum number of nodes

__device__ const int dist[MAX_V][MAX_V] = {
    { 0, 10, 15, 20 },
    { 10, 0, 35, 25 },
    { 15, 35, 0, 30 },
    { 20, 25, 30, 0 }
};

// Kernel to compute TSP
__device__ int tspKernel(int mask, int pos, int *dp, int n) {
    // Base case: if all cities have been visited
    if (mask == ((1 << n) - 1)) {
        return dist[pos][0]; // Return to the starting city (city 0)
    }

    int index = mask * n + pos; // Create a unique index for dp
    if (dp[index] != -1) {
        return dp[index]; // Return cached result if available
    }

    int ans = INT_MAX; // Initialize answer to maximum
    for (int city = 0; city < n; city++) {
        // If the city has not been visited
        if ((mask & (1 << city)) == 0) {
            int newAns = dist[pos][city] + tspKernel(mask | (1 << city), city, dp, n);
            ans = min(ans, newAns); // Update answer
        }
    }

    dp[index] = ans; // Cache the result
    return ans; // Return computed result
}

// Kernel wrapper for launching TSP
__global__ void tspLauncher(int *dp, int n, int *result) {
    result[0] = tspKernel(1, 0, dp, n); // Start TSP with node 0 and mask 1
}

int main() {
    int n = MAX_V;  // Number of cities

    // Initialize DP table in host memory
    int *dp = new int[(1 << n) * n];
    std::fill(dp, dp + (1 << n) * n, -1);

    int *d_dp, *d_result;
    int h_result;

    // Allocate device memory
    cudaMalloc((void**)&d_dp, (1 << n) * n * sizeof(int));
    cudaMalloc((void**)&d_result, sizeof(int));

    // Copy dp to device
    cudaMemcpy(d_dp, dp, (1 << n) * n * sizeof(int), cudaMemcpyHostToDevice);

    // Measure execution time
    auto start = std::chrono::high_resolution_clock::now();

    // Launch the TSP kernel
    tspLauncher<<<1, 1>>>(d_dp, n, d_result);
    cudaDeviceSynchronize(); // Wait for the kernel to finish

    // Copy result back to host
    cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;

    std::cout << "Nodes: " << n << " Time: " << duration.count() << " seconds. Min Cost: " << h_result << "\n";

    // Free device memory
    cudaFree(d_dp);
    cudaFree(d_result);

    // Free host memory
    delete[] dp;

    return 0;
}


Nodes: 4 Time: 0.000438426 seconds. Min Cost: 80



In [16]:
%%cuda
#include <iostream>
#include <vector>
#include <limits>
#include <chrono>
#include <random>
#include <cuda_runtime.h>

#define MAX 20  // Maximum number of nodes
#define THREADS_PER_BLOCK 256
#define GPU_MAX_NODES 9  // Maximum nodes for GPU computation

// Error checking macro for CUDA calls
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Function to generate a random adjacency matrix
void generateRandomAdjacencyMatrix(int nodes, int maxWeight, std::vector<std::vector<int>>& adj) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(1, maxWeight);

    adj.resize(nodes, std::vector<int>(nodes));
    for (int i = 0; i < nodes; ++i) {
        for (int j = 0; j < nodes; ++j) {
            if (i == j) {
                adj[i][j] = 0;  // Diagonal elements are zero (no self-loop)
            } else {
                adj[i][j] = dis(gen);
            }
        }
    }
}

// CPU implementation of TSP using dynamic programming
long long tspCPU(int mask, int pos, const std::vector<std::vector<int>>& adj, std::vector<std::vector<long long>>& dp) {
    int n = adj.size();
    if (mask == (1 << n) - 1) {
        return adj[pos][0]; // Return to the starting city
    }

    if (dp[mask][pos] != -1) {
        return dp[mask][pos];
    }

    long long ans = std::numeric_limits<long long>::max();
    for (int city = 0; city < n; city++) {
        if ((mask & (1 << city)) == 0) {
            long long newAns = adj[pos][city] + tspCPU(mask | (1 << city), city, adj, dp);
            ans = std::min(ans, newAns);
        }
    }

    return dp[mask][pos] = ans;
}

// Device function for TSP using dynamic programming
__device__ long long tspDP(int mask, int pos, const int *adj, long long *dp, int n) {
    if (mask == (1 << n) - 1) {
        return adj[pos * n + 0]; // Return to the starting city
    }

    int index = mask * n + pos; // Create a unique index for dp
    if (dp[index] != -1) {
        return dp[index];
    }

    long long ans = LLONG_MAX;
    for (int city = 0; city < n; city++) {
        if ((mask & (1 << city)) == 0) {
            long long newAns = (long long)adj[pos * n + city] + tspDP(mask | (1 << city), city, adj, dp, n);
            ans = min(ans, newAns);
        }
    }

    return dp[index] = ans;
}

// Kernel to initialize the DP table
__global__ void initDP(long long *dp, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        dp[idx] = -1;
    }
}

// Kernel to launch the TSP DP calculation
__global__ void tspLauncher(int *adj, long long *dp, int n, long long *result) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        result[0] = tspDP(1, 0, adj, dp, n); // Start TSP from node 0 with mask 1
    }
}

int main() {
    int maxWeight = 100;

    for (int nodes = 4; nodes <= MAX; nodes++) {
        std::vector<std::vector<int>> adj;
        generateRandomAdjacencyMatrix(nodes, maxWeight, adj);

        auto start = std::chrono::high_resolution_clock::now();
        long long result;

        if (nodes <= GPU_MAX_NODES) {
            // GPU implementation
            int *d_adj;
            long long *d_dp, *d_result;
            long long *h_result = new long long[1];

            gpuErrchk(cudaMalloc((void**)&d_adj, nodes * nodes * sizeof(int)));
            gpuErrchk(cudaMalloc((void**)&d_dp, (1LL << nodes) * nodes * sizeof(long long)));
            gpuErrchk(cudaMalloc((void**)&d_result, sizeof(long long)));

            // Copy adjacency matrix to device
            std::vector<int> flat_adj(nodes * nodes);
            for (int i = 0; i < nodes; ++i) {
                for (int j = 0; j < nodes; ++j) {
                    flat_adj[i * nodes + j] = adj[i][j];
                }
            }
            gpuErrchk(cudaMemcpy(d_adj, flat_adj.data(), nodes * nodes * sizeof(int), cudaMemcpyHostToDevice));

            // Initialize DP table
            int dpSize = (1LL << nodes) * nodes;
            int blocks = (dpSize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
            initDP<<<blocks, THREADS_PER_BLOCK>>>(d_dp, dpSize);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            // Launch the TSP kernel
            tspLauncher<<<1, 1>>>(d_adj, d_dp, nodes, d_result);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            // Copy result back to host
            gpuErrchk(cudaMemcpy(h_result, d_result, sizeof(long long), cudaMemcpyDeviceToHost));

            result = h_result[0];

            // Free device memory
            gpuErrchk(cudaFree(d_adj));
            gpuErrchk(cudaFree(d_dp));
            gpuErrchk(cudaFree(d_result));
            delete[] h_result;
        } else {
            // CPU implementation
            std::vector<std::vector<long long>> dp(1 << nodes, std::vector<long long>(nodes, -1));
            result = tspCPU(1, 0, adj, dp);
        }

        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end - start;

        std::cout << "Nodes: " << nodes << " Time: " << duration.count() << " seconds. Cost: " << result << "\n";
    }

    return 0;
}

Nodes: 4 Time: 0.0976705 seconds. Cost: 99
Nodes: 5 Time: 0.000285805 seconds. Cost: 217
Nodes: 6 Time: 0.000461098 seconds. Cost: 204
Nodes: 7 Time: 0.000904126 seconds. Cost: 166
Nodes: 8 Time: 0.00195309 seconds. Cost: 171
Nodes: 9 Time: 0.00485123 seconds. Cost: 213
Nodes: 10 Time: 0.000644813 seconds. Cost: 132
Nodes: 11 Time: 0.00134765 seconds. Cost: 217
Nodes: 12 Time: 0.00309146 seconds. Cost: 130
Nodes: 13 Time: 0.00718959 seconds. Cost: 166
Nodes: 14 Time: 0.0171433 seconds. Cost: 142
Nodes: 15 Time: 0.0396535 seconds. Cost: 120
Nodes: 16 Time: 0.0935338 seconds. Cost: 174
Nodes: 17 Time: 0.240396 seconds. Cost: 195
Nodes: 18 Time: 0.612629 seconds. Cost: 167
Nodes: 19 Time: 1.4945 seconds. Cost: 182
Nodes: 20 Time: 4.15719 seconds. Cost: 165

