<a href="https://colab.research.google.com/github/UrsachiGabriela/PGPU_proiect/blob/main/Introduction_to_CUDA_%2B_profiling_using_google_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PGPU - proiect


# Generare graf

In [18]:
%%writefile graph_generator.cpp

#include <iostream>
#include <vector>
#include <fstream>
#include <cstdlib>
#include <ctime>
#include <algorithm>

using namespace std;

// Builds a random simple undirected graph (no self-loops, no duplicate edges)
// as an adjacency list.
//
// numNodes: number of vertices; values <= 0 yield an empty graph.
// numEdges: number of distinct undirected edges requested. The value is
//           clamped to [0, numNodes * (numNodes - 1) / 2], the most a simple
//           graph can hold; without the clamp the rejection loop below would
//           never find a free pair and would spin forever.
//
// Returns one neighbour list per node; every edge {u, v} is stored twice,
// once in graph[u] and once in graph[v].
//
// Note: the C library generator is re-seeded from the wall clock on each call.
vector<vector<int>> generateRandomGraph(int numNodes, int numEdges) {
    // Guards against rand() % 0 and against a negative vector size.
    if (numNodes <= 0) {
        return {};
    }

    vector<vector<int>> graph(numNodes);
    srand(time(0));

    // 64-bit arithmetic so the product cannot overflow for large node counts.
    const long long maxEdges = static_cast<long long>(numNodes) * (numNodes - 1) / 2;
    long long edgeCount = numEdges;
    if (edgeCount < 0) {
        edgeCount = 0;
    }
    if (edgeCount > maxEdges) {
        edgeCount = maxEdges;
    }

    for (long long i = 0; i < edgeCount; ++i) {
        int u;
        int v;
        // Rejection sampling: redraw until the pair is neither a self-loop nor
        // an edge that already exists. Checking graph[u] alone is sufficient
        // because every edge is recorded in both endpoint lists.
        do {
            u = rand() % numNodes;
            v = rand() % numNodes;
        } while (u == v || find(graph[u].begin(), graph[u].end(), v) != graph[u].end());

        graph[u].push_back(v);
        graph[v].push_back(u); // undirected: store the reverse direction too
    }

    return graph;
}

// Serialises an adjacency list to a text file. Line i of the file holds the
// neighbours of node i separated by single spaces; a node without neighbours
// produces an empty line.
//
// filename: path of the file to create or overwrite.
// graph:    adjacency list to write.
//
// If the file cannot be opened, an error is printed to stderr and nothing is
// written.
void writeGraphToFile(const string& filename, const vector<vector<int>>& graph) {
    ofstream out(filename);
    if (!out) {
        cerr << "Eroare la deschiderea fișierului pentru scriere!" << endl;
        return;
    }

    for (const vector<int>& row : graph) {
        // Empty before the first value, a single space before every later one,
        // so no trailing separator is emitted.
        const char* separator = "";
        for (int neighbor : row) {
            out << separator << neighbor;
            separator = " ";
        }
        out << endl; // end of this node's neighbour list
    }

    out.close();
}

// Entry point of the generator: builds a random graph with fixed example
// dimensions, saves it to graph_output.txt and reports the file name.
int main() {
    const int numNodes = 1000; // example size
    const int numEdges = 5000;
    const string filename = "graph_output.txt";

    const vector<vector<int>> graph = generateRandomGraph(numNodes, numEdges);
    writeGraphToFile(filename, graph);

    cout << "Graf generat și salvat în " << filename << endl;
    return 0;
}



Overwriting graph_generator.cpp


In [19]:
!g++ graph_generator.cpp -o graph_generator
!./graph_generator


Graf generat și salvat în graph_output.txt


In [20]:
from google.colab import files
files.download('graph_output.txt')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# CPU

In [24]:
%%writefile bfs_cpu.cpp

#include <iostream>
#include <vector>
#include <queue>
#include <set>
#include <climits>
#include <fstream>
#include <sstream>
#include <string>

using namespace std;

// Loads an adjacency list from a text file in which line i lists the
// whitespace-separated neighbour ids of node i. An empty line yields a node
// with no neighbours.
//
// filename: path of the file to read.
// Returns the adjacency list, or an empty vector (after printing an error to
// stderr) when the file cannot be opened.
vector<vector<int>> readGraphFromFile(const string& filename) {
    vector<vector<int>> adjacency;

    ifstream in(filename);
    if (!in) {
        cerr << "Eroare la deschiderea fisierului!" << endl;
        return adjacency;
    }

    for (string row; getline(in, row); ) {
        // One file line == one node: append its (initially empty) list, then
        // fill it with every integer that can be extracted from the line.
        adjacency.emplace_back();
        stringstream tokens(row);
        for (int id; tokens >> id; ) {
            adjacency.back().push_back(id);
        }
    }

    in.close();
    return adjacency;
}



// Sequential breadth-first search over an adjacency list.
//
// adjacencyList: adjacencyList[u] holds the neighbour ids of node u.
// start_node:    node the search starts from.
// distance:      output; resized to adjacencyList.size(). On return
//                distance[v] is the number of edges on a shortest path from
//                start_node to v, or INT_MAX if v was not reached.
//
// Each node is printed to stdout together with its distance when it is
// dequeued. If start_node is not a valid index (which includes the empty
// graph), a message is written to stderr and every distance stays INT_MAX.
// Neighbour ids outside [0, adjacencyList.size()) are ignored instead of
// being used as indices.
void bfs_cpu(const vector<vector<int>>& adjacencyList, int start_node, vector<int>& distance) {
    const int node_count = static_cast<int>(adjacencyList.size());

    // Size the output here so the function does not depend on the caller
    // having pre-sized it correctly.
    distance.assign(adjacencyList.size(), INT_MAX);

    if (start_node < 0 || start_node >= node_count) {
        cerr << "bfs_cpu: start_node " << start_node << " is out of range" << endl;
        return;
    }

    queue<int> to_visit_queue;
    distance[start_node] = 0;
    to_visit_queue.push(start_node);

    while (!to_visit_queue.empty()) {
        int current = to_visit_queue.front();
        to_visit_queue.pop();

        cout << current <<", distance: " << distance[current] << endl;

        for (int neighbor : adjacencyList[current]) {
            if (neighbor < 0 || neighbor >= node_count) {
                continue; // malformed input: never index outside `distance`
            }
            if (distance[neighbor] == INT_MAX) { // neighbor not visited yet
                distance[neighbor] = distance[current] + 1;
                to_visit_queue.push(neighbor);
            }
        }
    }
}

// Entry point of the CPU version: loads graph_output.txt and runs a
// breadth-first search from node 0, which prints every reached node together
// with its distance.
//
// Returns 1 when the graph is empty (for example because the file could not
// be opened), since node 0 does not exist in that case; returns 0 otherwise.
int main() {
    const string filename = "graph_output.txt";
    const vector<vector<int>> graph = readGraphFromFile(filename);

    // Small hand-written graph that can replace the file input when debugging:
    //   {1, 2}, {0, 3, 4}, {0, 4, 5}, {1, 6}, {1, 2, 5}, {2, 4}, {3}

    if (graph.empty()) {
        cerr << "Graph is empty or could not be read: " << filename << endl;
        return 1;
    }

    const int start_node = 0;
    vector<int> distance(graph.size());

    bfs_cpu(graph, start_node, distance);

    return 0;
}

Overwriting bfs_cpu.cpp


In [25]:
%%shell
g++ bfs_cpu.cpp -o bfs_cpu



In [26]:
%%shell
./bfs_cpu

0, distance: 0
245, distance: 1
492, distance: 1
929, distance: 1
491, distance: 1
235, distance: 1
931, distance: 1
757, distance: 1
7, distance: 1
923, distance: 1
455, distance: 2
540, distance: 2
275, distance: 2
361, distance: 2
642, distance: 2
688, distance: 2
355, distance: 2
517, distance: 2
549, distance: 2
133, distance: 2
571, distance: 2
640, distance: 2
739, distance: 2
262, distance: 2
691, distance: 2
574, distance: 2
744, distance: 2
716, distance: 2
358, distance: 2
329, distance: 2
81, distance: 2
792, distance: 2
681, distance: 2
316, distance: 2
130, distance: 2
734, distance: 2
808, distance: 2
347, distance: 2
120, distance: 2
440, distance: 2
920, distance: 2
874, distance: 2
837, distance: 2
180, distance: 2
917, distance: 2
297, distance: 2
19, distance: 2
430, distance: 2
548, distance: 2
309, distance: 2
672, distance: 2
186, distance: 2
671, distance: 2
95, distance: 2
669, distance: 2
390, distance: 2
536, distance: 2
438, distance: 2
485, distance: 2
348,



# GPU

In [29]:
%%writefile bfs2.cu

#include <iostream>
#include <vector>
#include <climits>
#include <cuda_runtime.h>
#include <fstream>
#include <sstream>
#include <string>

using namespace std;


cudaError_t bfs_gpu(const vector<vector<int>> &graph, int start_node, vector<int> &distance);

// Debug kernel: each launched thread prints one fixed line through device-side
// printf. The `level` argument is accepted but not used.
__global__ void simpleKernel(int level) {
    (void)level;
    printf("simple kernel\n");
}


// Expands one BFS level. The thread with global index t handles the frontier
// entry currentFrontier[t]; threads with t >= frontierSize do nothing.
//
// adjacencyList / offsets: the neighbours of node u are
//                          adjacencyList[offsets[u] .. offsets[u + 1] - 1].
// distance:         per-node distance; INT_MAX marks a node not yet reached.
// currentFrontier:  nodes of the level being expanded (frontierSize entries).
// nextFrontier:     receives the nodes discovered by this launch.
// nextFrontierSize: counter of entries written to nextFrontier; it is only
//                   incremented here, so it must be zero before the launch.
// level:            depth of the current frontier; discovered nodes get level + 1.
//
// Indexing uses only the x dimension of the grid and block.
__global__ void computeNextLevel(int *adjacencyList, int *offsets, int *distance, int *currentFrontier, int frontierSize, int *nextFrontier, int *nextFrontierSize, int level) {
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;
    if (slot < frontierSize) {
        const int node = currentFrontier[slot];
        const int nextDepth = level + 1;
        const int rowEnd = offsets[node + 1];

        int edge = offsets[node];
        while (edge < rowEnd) {
            const int target = adjacencyList[edge];
            ++edge;

            // Only the thread whose compare-and-swap replaces INT_MAX claims
            // the node, so each node is appended to the next frontier once.
            const int previous = atomicCAS(&distance[target], INT_MAX, nextDepth);
            if (previous != INT_MAX) {
                continue;
            }

            // Reserve a unique output slot, then publish the node into it.
            const int writeIndex = atomicAdd(nextFrontierSize, 1);
            nextFrontier[writeIndex] = target;
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Breadth-first search on the GPU, one kernel launch per BFS level.
//
// graph:      adjacency list; graph[u] holds the neighbour ids of node u.
// start_node: node the search starts from.
// distance:   output; resized to graph.size(). After a successful run
//             distance[v] is the BFS depth of v, or INT_MAX if v was not reached.
//
// Returns cudaSuccess on success. Returns cudaErrorInvalidValue when
// start_node or any neighbour id is not a valid node index (this includes the
// empty graph). Otherwise returns the status of the first CUDA call that
// failed. All device buffers are released on every path that allocated them.
//
// The graph is flattened into two arrays before upload: `offsets` (size
// num_nodes + 1) and `adjacencyList`, with the neighbours of u stored at
// adjacencyList[offsets[u] .. offsets[u + 1] - 1]. Offsets are held in `int`.
cudaError_t bfs_gpu(const vector<vector<int>> &graph, int start_node, vector<int> &distance) {
    cudaError_t cudaStatus = cudaSuccess;

    const int NEXT_FRONTIER_SIZE = 0;
    const int threads_per_block = 256;
    const int num_nodes = static_cast<int>(graph.size());

    int frontier_size = 1;
    int level = 0;

    distance.assign(num_nodes, INT_MAX);

    // Validate before touching distance[start_node] or the device.
    if (start_node < 0 || start_node >= num_nodes) {
        fprintf(stderr, "bfs_gpu: start_node %d is outside [0, %d)\n", start_node, num_nodes);
        return cudaErrorInvalidValue;
    }
    distance[start_node] = 0;

    vector<int> adjacencyList;
    vector<int> offsets(num_nodes + 1, 0);

    for (int i = 0; i < num_nodes; ++i) {
        // The kernel uses neighbour ids as indices into device arrays, so
        // reject malformed input on the host.
        for (int neighbor : graph[i]) {
            if (neighbor < 0 || neighbor >= num_nodes) {
                fprintf(stderr, "bfs_gpu: node %d has invalid neighbour %d\n", i, neighbor);
                return cudaErrorInvalidValue;
            }
        }
        offsets[i + 1] = offsets[i] + static_cast<int>(graph[i].size());
        adjacencyList.insert(adjacencyList.end(), graph[i].begin(), graph[i].end());
    }

    const size_t adjacencyBytes = adjacencyList.size() * sizeof(int);
    const size_t offsetsBytes = offsets.size() * sizeof(int);
    const size_t nodeBytes = static_cast<size_t>(num_nodes) * sizeof(int);
    // Allocate at least one element so a graph without edges never requests a
    // zero-byte buffer.
    const size_t adjacencyAllocBytes = adjacencyBytes > 0 ? adjacencyBytes : sizeof(int);

    // Every variable that is still in scope at the Error label is declared
    // above the first goto, so no jump bypasses an initialisation.
    int *d_adjacencyList = nullptr;
    int *d_offsets = nullptr;
    int *d_distance = nullptr;
    int *d_frontier = nullptr;
    int *d_nextFrontier = nullptr;
    int *d_nextFrontierSize = nullptr;

    cudaStatus = cudaMalloc((void**)&d_adjacencyList, adjacencyAllocBytes);
    if (!cudaCallSucceeded(cudaStatus, "cudaMalloc(d_adjacencyList)")) goto Error;

    cudaStatus = cudaMalloc((void**)&d_offsets, offsetsBytes);
    if (!cudaCallSucceeded(cudaStatus, "cudaMalloc(d_offsets)")) goto Error;

    cudaStatus = cudaMalloc((void**)&d_distance, nodeBytes);
    if (!cudaCallSucceeded(cudaStatus, "cudaMalloc(d_distance)")) goto Error;

    // Each node enters a frontier at most once, so num_nodes entries suffice.
    cudaStatus = cudaMalloc((void**)&d_frontier, nodeBytes);
    if (!cudaCallSucceeded(cudaStatus, "cudaMalloc(d_frontier)")) goto Error;

    cudaStatus = cudaMalloc((void**)&d_nextFrontier, nodeBytes);
    if (!cudaCallSucceeded(cudaStatus, "cudaMalloc(d_nextFrontier)")) goto Error;

    cudaStatus = cudaMalloc((void**)&d_nextFrontierSize, sizeof(int));
    if (!cudaCallSucceeded(cudaStatus, "cudaMalloc(d_nextFrontierSize)")) goto Error;

    // host to device
    cudaStatus = cudaMemcpy(d_adjacencyList, adjacencyList.data(), adjacencyBytes, cudaMemcpyHostToDevice);
    if (!cudaCallSucceeded(cudaStatus, "cudaMemcpy(d_adjacencyList)")) goto Error;

    cudaStatus = cudaMemcpy(d_offsets, offsets.data(), offsetsBytes, cudaMemcpyHostToDevice);
    if (!cudaCallSucceeded(cudaStatus, "cudaMemcpy(d_offsets)")) goto Error;

    cudaStatus = cudaMemcpy(d_distance, distance.data(), nodeBytes, cudaMemcpyHostToDevice);
    if (!cudaCallSucceeded(cudaStatus, "cudaMemcpy(d_distance)")) goto Error;

    // The initial frontier consists of the start node only.
    cudaStatus = cudaMemcpy(d_frontier, &start_node, sizeof(int), cudaMemcpyHostToDevice);
    if (!cudaCallSucceeded(cudaStatus, "cudaMemcpy(d_frontier)")) goto Error;

    while (frontier_size > 0) {
        // Reset the device-side counter of discovered nodes for this level.
        cudaStatus = cudaMemcpy(d_nextFrontierSize, &NEXT_FRONTIER_SIZE, sizeof(int), cudaMemcpyHostToDevice);
        if (!cudaCallSucceeded(cudaStatus, "cudaMemcpy(d_nextFrontierSize)")) goto Error;

        // The two frontier buffers swap roles on every level.
        int *d_currentQueue = (level % 2 == 0) ? d_frontier : d_nextFrontier;
        int *d_nextQueue = (level % 2 == 0) ? d_nextFrontier : d_frontier;

        int blocks = (frontier_size + threads_per_block - 1) / threads_per_block;
        computeNextLevel<<<blocks, threads_per_block>>>(d_adjacencyList, d_offsets, d_distance, d_currentQueue, frontier_size, d_nextQueue, d_nextFrontierSize, level);

        // Launch-configuration errors are reported by cudaGetLastError();
        // errors raised while the kernel runs are reported by the synchronize.
        cudaStatus = cudaGetLastError();
        if (!cudaCallSucceeded(cudaStatus, "computeNextLevel launch")) goto Error;

        cudaStatus = cudaDeviceSynchronize();
        if (!cudaCallSucceeded(cudaStatus, "computeNextLevel execution")) goto Error;

        // The size of the next frontier decides whether another level is needed.
        cudaStatus = cudaMemcpy(&frontier_size, d_nextFrontierSize, sizeof(int), cudaMemcpyDeviceToHost);
        if (!cudaCallSucceeded(cudaStatus, "cudaMemcpy(frontier_size)")) goto Error;

        ++level;
    }

    // device to host
    cudaStatus = cudaMemcpy(distance.data(), d_distance, nodeBytes, cudaMemcpyDeviceToHost);
    (void)cudaCallSucceeded(cudaStatus, "cudaMemcpy(distance)");

Error:
    // Pointers that were never allocated are still null here.
    cudaFree(d_adjacencyList);
    cudaFree(d_offsets);
    cudaFree(d_distance);
    cudaFree(d_frontier);
    cudaFree(d_nextFrontier);
    cudaFree(d_nextFrontierSize);

    return cudaStatus;
}



// Loads an adjacency list from a text file in which line i lists the
// whitespace-separated neighbour ids of node i. An empty line yields a node
// with no neighbours.
//
// filename: path of the file to read.
// Returns the adjacency list, or an empty vector (after printing an error to
// stderr) when the file cannot be opened.
vector<vector<int>> readGraphFromFile(const string& filename) {
    vector<vector<int>> adjacency;

    ifstream in(filename);
    if (!in) {
        cerr << "Eroare la deschiderea fisierului!" << endl;
        return adjacency;
    }

    for (string row; getline(in, row); ) {
        // One file line == one node: append its (initially empty) list, then
        // fill it with every integer that can be extracted from the line.
        adjacency.emplace_back();
        stringstream tokens(row);
        for (int id; tokens >> id; ) {
            adjacency.back().push_back(id);
        }
    }

    in.close();
    return adjacency;
}

// Entry point of the GPU version: loads graph_output.txt, runs the GPU
// breadth-first search from node 0 and prints one "Node i, Distance: d" line
// per node. Returns 1 if bfs_gpu reports an error, 0 otherwise.
int main() {
    const string filename = "graph_output.txt";
    const vector<vector<int>> graph = readGraphFromFile(filename);

    const int start_node = 0;
    vector<int> distance;

    if (bfs_gpu(graph, start_node, distance) != cudaSuccess) {
        fprintf(stderr, "bfs_gpu failed!");
        return 1;
    }

    int node = 0;
    for (int hops : distance) {
        cout << "Node " << node << ", Distance: " << hops << endl;
        ++node;
    }

    return 0;
}

Overwriting bfs2.cu


The next cell compiles the GPU implementation and profiles it: an `nvprof` summary, an `nvprof` GPU trace, and a full Nsight Compute (`ncu`) report.

In [30]:
%%shell

nvcc bfs2.cu -o bfs2

nvprof  ./bfs2

nvprof --print-gpu-trace ./bfs2

ncu -f --set full -o bfs2 ./bfs2

==5989== NVPROF is profiling process 5989, command: ./bfs2
Node 0, Distance: 0
Node 1, Distance: 3
Node 2, Distance: 4
Node 3, Distance: 4
Node 4, Distance: 4
Node 5, Distance: 3
Node 6, Distance: 4
Node 7, Distance: 1
Node 8, Distance: 4
Node 9, Distance: 3
Node 10, Distance: 4
Node 11, Distance: 3
Node 12, Distance: 3
Node 13, Distance: 4
Node 14, Distance: 4
Node 15, Distance: 3
Node 16, Distance: 3
Node 17, Distance: 3
Node 18, Distance: 3
Node 19, Distance: 2
Node 20, Distance: 3
Node 21, Distance: 3
Node 22, Distance: 4
Node 23, Distance: 4
Node 24, Distance: 4
Node 25, Distance: 4
Node 26, Distance: 4
Node 27, Distance: 4
Node 28, Distance: 3
Node 29, Distance: 3
Node 30, Distance: 3
Node 31, Distance: 3
Node 32, Distance: 3
Node 33, Distance: 3
Node 34, Distance: 4
Node 35, Distance: 4
Node 36, Distance: 3
Node 37, Distance: 3
Node 38, Distance: 3
Node 39, Distance: 3
Node 40, Distance: 3
Node 41, Distance: 3
Node 42, Distance: 3
Node 43, Distance: 4
Node 44, Distance: 4
Node 4

