In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-ofka0he8
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-ofka0he8
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=d60c6e9907f2cf84679e6ca7a25d4d8c0bec15ca60fdbf11a994bd1507422e53
  Stored in directory: /tmp/pip-ephem-wheel-cache-kuofl6o8/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collecte

In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [5]:
%%cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <omp.h>
#include <curand.h>
#include <curand_kernel.h>

// --- Genetic-algorithm parameters ---
// Number of candidate solutions kept in each thread's private population.
#define POPULATION_SIZE 100
// Upper bound on the number of generations a single thread runs.
#define MAX_ITERATIONS 10000000
// NOTE(review): not referenced anywhere in the visible code; presumably an
// intended wall-clock limit in seconds that was never wired in — confirm.
#define MAX_TIME 300

// --- Capacity limits used to size fixed buffers in device code ---
// Largest task count the per-thread task buffers in the kernel can hold.
#define MAX_TASKS 1000
// Largest processor count the per-processor sum array in evaluateCmax can hold.
#define MAX_PROCESSORS 50
// Threads per block for the kernel launch; also the length of the shared
// per-thread result array inside the kernel.
#define NUM_THREADS 512

// Index of the processor a task is assigned to (one byte per task).
typedef unsigned char ProcId;
// Pointer to the mutable generator state that randInt advances in place.
typedef unsigned int* RandSeed;

// One candidate schedule.
typedef struct {
    // Largest per-processor sum of task times for this assignment, as
    // computed by evaluateCmax.
    int pcmax;
    // tasks[i] is the processor index assigned to task i. The struct does not
    // own this storage; the kernel points it into a separately declared buffer.
    ProcId* tasks;
} Solution;

// Result record written by the kernel and copied back to the host.
typedef struct {
    // Smallest pcmax found.
    int best;
    // Generation counter of the thread that produced `best`.
    int iterations;
} CudaOutput;

// Problem instance in constant memory; filled from the host with
// cudaMemcpyToSymbol before the kernel is launched.
// Number of processors in the instance.
__device__ __constant__ int processorsNum;
// Number of tasks in the instance.
__device__ __constant__ int tasksNum;
// Processing time of each task; only the first tasksNum entries are copied in.
// NOTE(review): capacity is 2048 while MAX_TASKS is 1000 — the smaller limit is
// the one the kernel's per-thread buffers are sized by.
__device__ __constant__ int taskTimes[2048];


/**
 * Returns a pseudo-random integer in the inclusive range [a, b].
 *
 * A fresh cuRAND state is initialised from `seed` on every call (sequence 0,
 * offset 0), so calling this twice with the same seed yields the same value.
 *
 * @param a     lower bound (inclusive)
 * @param b     upper bound (inclusive); if b <= a the function returns a
 * @param seed  seed passed to curand_init
 * @return      a value v with a <= v <= b
 */
__device__ int generateRandomInt(int a, int b, int seed){
    // An empty or single-value range has only one valid answer; this also
    // avoids a zero or negative divisor below.
    if(b <= a){
        return a;
    }

    curandState_t state;
    curand_init(seed, 0, 0, &state);

    // curand() produces an unsigned 32-bit value. Storing it in a signed int
    // made the remainder negative for half of all draws, pushing the result
    // below `a`; keep the whole computation unsigned instead.
    const unsigned int random = curand(&state);
    const unsigned int span = (unsigned int)b - (unsigned int)a + 1u;

    // span wraps to 0 only when [a, b] covers every int; any draw is valid then.
    if(span == 0u){
        return (int)random;
    }

    return (int)((unsigned int)a + random % span);
}
/**
 * Advances the generator state behind `seed` by one linear-feedback
 * shift-register step and returns the new state.
 *
 * The feedback bit is the XOR of bits 0, 1, 21 and 31 of the current state;
 * the state is shifted left by one and the feedback bit becomes the new bit 0.
 * A state of 0 maps to 0, so the caller must start from a non-zero value to
 * get a changing sequence.
 */
__device__ unsigned int randInt(RandSeed seed) {
    const unsigned int state = *seed;
    const unsigned int feedback =
        (state ^ (state >> 1) ^ (state >> 21) ^ (state >> 31)) & 1u;
    const unsigned int next = (state << 1) | feedback;

    *seed = next;
    return next;
}

/**
 * Reads the scheduling instance from "m10n200.txt" in the working directory.
 *
 * Expected layout: processor count, task count, then one processing time per
 * task, all whitespace-separated integers.
 *
 * @param pNum    receives the number of processors
 * @param tNum    receives the number of tasks
 * @param tTimes  receives a malloc'ed array of *tNum task times; the caller
 *                owns it and must free() it
 *
 * Terminates the process with exit(1) if the file is missing or malformed, if
 * the sizes fall outside [1, MAX_PROCESSORS] / [1, MAX_TASKS], or if memory
 * cannot be allocated. On success the sizes are echoed to stderr.
 */
void load_instance(int* pNum, int*tNum, int** tTimes){
    FILE* file = fopen("m10n200.txt", "r");
    if(!file){
        fprintf(stderr, "No such file. Exit program.");
        exit(1);
    }

    // fscanf returns EOF (a non-zero value) when the input ends early, so the
    // result has to be compared with the expected number of conversions.
    if(fscanf(file, "%d", pNum) != 1 || fscanf(file, "%d", tNum) != 1){
        fprintf(stderr, "Malformed instance header. Exit program.\n");
        fclose(file);
        exit(1);
    }

    // The device code uses fixed-size buffers sized by these limits; anything
    // larger would overrun them, and zero/negative sizes are meaningless.
    if(*pNum < 1 || *pNum > MAX_PROCESSORS || *tNum < 1 || *tNum > MAX_TASKS){
        fprintf(stderr,
                "Instance size out of range (processors: %d, max %d; tasks: %d, max %d). Exit program.\n",
                *pNum, MAX_PROCESSORS, *tNum, MAX_TASKS);
        fclose(file);
        exit(1);
    }

    *tTimes = (int*)malloc(sizeof(int) * (size_t)*tNum);
    if(!*tTimes){
        fprintf(stderr, "Out of memory. Exit program.\n");
        fclose(file);
        exit(1);
    }

    for(int i = 0; i < *tNum; i++){
        if(fscanf(file, "%d", &(*tTimes)[i]) != 1){
            fprintf(stderr, "Malformed task time at index %d. Exit program.\n", i);
            free(*tTimes);
            *tTimes = NULL;
            fclose(file);
            exit(1);
        }
    }
    fclose(file);

    fprintf(stderr,"P: %d \n", *pNum);
    fprintf(stderr,"T: %d \n", *tNum);

    fprintf(stderr,"\n");
}

/**
 * Recomputes solution->pcmax: the largest total processing time placed on any
 * single processor under the assignment stored in solution->tasks.
 *
 * Reads processorsNum, tasksNum and taskTimes from constant memory.
 */
__device__ void evaluateCmax(Solution* solution){
    int load[MAX_PROCESSORS];
    const ProcId* assignment = solution->tasks;

    // Clear the accumulators for the processors actually in use.
    int p = 0;
    while(p < processorsNum){
        load[p++] = 0;
    }

    // Add every task's time to the processor it is assigned to.
    for(int t = 0; t < tasksNum; ++t){
        load[assignment[t]] += taskTimes[t];
    }

    // The makespan is the heaviest processor load.
    int makespan = 0;
    for(p = 0; p < processorsNum; ++p){
        makespan = (load[p] > makespan) ? load[p] : makespan;
    }

    solution->pcmax = makespan;
}


/**
 * Fills `sol` with a random assignment: each of the tasksNum tasks gets a
 * processor index drawn as randInt(seed) % processorsNum, then pcmax is
 * evaluated for the result.
 */
__device__ void generateFirstSolution(Solution* sol, RandSeed seed){
    ProcId* slot = sol->tasks;
    int remaining = tasksNum;

    while(remaining-- > 0){
        *slot++ = randInt(seed) % processorsNum;
    }

    evaluateCmax(sol);
}

/**
 * Initialises all POPULATION_SIZE entries of `pop` with random solutions,
 * in index order, drawing from the generator behind `seed`.
 */
__device__ void generateFirstPopulation(Solution* pop, RandSeed seed){
    Solution* const end = pop + POPULATION_SIZE;

    for(Solution* current = pop; current != end; ++current){
        generateFirstSolution(current, seed);
    }
}

/**
 * Updates `bestSolution` with any member of `population` whose pcmax is
 * strictly smaller than the current best, copying both the pcmax value and
 * the full task assignment.
 */
__device__ void getBestSolution(Solution* population, Solution* bestSolution){
    for(int idx = 0; idx < POPULATION_SIZE; ++idx){
        const Solution* candidate = &population[idx];

        // Only a strict improvement replaces the stored best.
        if(candidate->pcmax >= bestSolution->pcmax){
            continue;
        }

        bestSolution->pcmax = candidate->pcmax;
        for(int t = 0; t < tasksNum; ++t){
            bestSolution->tasks[t] = candidate->tasks[t];
        }
    }
}
/**
 * Returns the index of the first population member with the smallest pcmax.
 * Members whose pcmax is not below 1000000 are never selected; if none
 * qualifies the result is 0.
 */
__device__ int getBestParentIndex(Solution* population){
    int winner = 0;
    int lowest = 1000000;

    for(int idx = 0; idx < POPULATION_SIZE; ++idx){
        const int cmax = population[idx].pcmax;
        if(cmax < lowest){
            lowest = cmax;
            winner = idx;
        }
    }

    return winner;
}

/**
 * Returns the index of the first population member with the smallest pcmax
 * when the member at `bestParentIndex` is left out of the search.
 * Members whose pcmax is not below 1000000 are never selected; if none
 * qualifies the result is 0.
 */
__device__ int getSecondBestParentIndex(Solution* population, int bestParentIndex){
    int runnerUp = 0;
    int lowest = 1000000;

    for(int idx = 0; idx < POPULATION_SIZE; ++idx){
        if(idx != bestParentIndex && population[idx].pcmax < lowest){
            lowest = population[idx].pcmax;
            runnerUp = idx;
        }
    }

    return runnerUp;
}
/**
 * Single-point crossover: `child` receives parent1's assignment for task
 * indices below a random cut point and parent2's assignment from the cut
 * point onward, then its pcmax is re-evaluated.
 *
 * The cut point is randInt(seed) % tasksNum.
 */
__device__ void crossOver(Solution* parent1, Solution* parent2, Solution* child, RandSeed seed){
    const int cut = randInt(seed) % tasksNum;

    int t = 0;
    // Head of the child comes from the first parent...
    for(; t < cut; ++t){
        child->tasks[t] = parent1->tasks[t];
    }
    // ...and the tail from the second.
    for(; t < tasksNum; ++t){
        child->tasks[t] = parent2->tasks[t];
    }

    evaluateCmax(child);
}
/**
 * Produces the next generation in place. The two members with the lowest
 * pcmax are chosen as parents; every other member is overwritten by a
 * crossover child when its random draw (randInt % 100) is at most 20.
 *
 * One random number is consumed per member, including the parents themselves,
 * before the parent check is applied.
 */
__device__ void generateNextPopulation(Solution* population, RandSeed seed){
    const int first = getBestParentIndex(population);
    const int second = getSecondBestParentIndex(population, first);

    Solution* const parentA = &population[first];
    Solution* const parentB = &population[second];

    for(int slot = 0; slot < POPULATION_SIZE; ++slot){
        // Draw first so the generator advances for every slot.
        const unsigned int roll = randInt(seed) % 100;
        const bool isParent = (slot == first) || (slot == second);

        if(roll <= 20 && !isParent){
            crossOver(parentA, parentB, &population[slot], seed);
        }
    }
}
/**
 * Mutates `sol` by swapping the processor assignments of two distinct,
 * randomly chosen tasks, then re-evaluates its pcmax.
 *
 * Does nothing when there are fewer than two tasks, since no swap is possible.
 * Always consumes exactly two random numbers otherwise.
 */
__device__ void mutate(Solution* sol, RandSeed seed){
    // With fewer than two tasks there is nothing to swap (and the modulus
    // below would be zero).
    if(tasksNum < 2){
        return;
    }

    const unsigned int count = (unsigned int)tasksNum;

    // Draw from the full index range [0, tasksNum). The previous modulus of
    // (tasksNum - 1) could never select the last task.
    const unsigned int first = randInt(seed) % count;

    // Pick the second index as a non-zero offset from the first, so the two
    // are guaranteed to differ without a retry loop that might never end.
    const unsigned int offset = 1u + randInt(seed) % (count - 1u);
    const unsigned int second = (first + offset) % count;

    const ProcId temp = sol->tasks[first];
    sol->tasks[first] = sol->tasks[second];
    sol->tasks[second] = temp;

    evaluateCmax(sol);
}

/**
 * Walks the population once and mutates each member whose random draw
 * (randInt % 100) is at most 5. One random number is consumed per member
 * for the draw itself.
 */
__device__ void mutatePopulation(Solution* population, RandSeed seed){
    for(int slot = 0; slot < POPULATION_SIZE; ++slot){
        const unsigned int roll = randInt(seed) % 100;
        if(roll <= 5){
            mutate(&population[slot], seed);
        }
    }
}
/**
 * Runs one independent genetic search per thread and reports the best result
 * of the block.
 *
 * Launch layout: only threadIdx.x is used, and per-thread results are stored
 * in a shared array of NUM_THREADS entries, so the kernel expects a single
 * 1-D block with blockDim.x <= NUM_THREADS.
 *
 * Each thread keeps its own population in per-thread automatic arrays
 * (MAX_TASKS * POPULATION_SIZE bytes of assignments plus bookkeeping) and
 * evolves it for up to MAX_ITERATIONS generations. As soon as any thread finds
 * a solution with pcmax <= limit it raises a shared flag and all threads stop
 * at the end of their current generation.
 *
 * @param limit     target pcmax; reaching it ends the search early
 * @param output    device pointer that receives the smallest pcmax found in
 *                  the block and the generation count of the thread that
 *                  found it (lowest thread index wins ties)
 * @param randSeed  base seed; each thread mixes its thread index into it
 */
__global__ void genetic(const int limit, CudaOutput* output, unsigned int randSeed){
    const int tid = threadIdx.x;

    // Give every thread a different starting state.
    randSeed ^= (tid & 0xfff) << 19;
    // randInt maps a zero state to zero forever, which would freeze this
    // thread's random sequence; substitute an arbitrary non-zero state.
    if(randSeed == 0){
        randSeed = 0x9E3779B9u;
    }

    // volatile so that every loop iteration re-reads the flag other threads set.
    __shared__ volatile int earlyStop;
    if(tid == 0) earlyStop = 0;

    __shared__ CudaOutput localOutput[NUM_THREADS];

    // Make the cleared flag visible before any thread can test it.
    __syncthreads();

    int iterations = 0;

    // Backing storage for the task assignments of the whole population;
    // solutions are packed back to back, tasksNum entries each.
    ProcId taskMem[MAX_TASKS * POPULATION_SIZE];
    Solution population[POPULATION_SIZE];
    for(int i = 0; i < POPULATION_SIZE; ++i){
        population[i].tasks = &taskMem[i * tasksNum];
    }

    ProcId bestSolutionTasks[MAX_TASKS];
    Solution bestSolution;
    bestSolution.tasks = bestSolutionTasks;
    bestSolution.pcmax = 1000000;   // sentinel: worse than any expected pcmax

    // first population
    generateFirstPopulation(population, &randSeed);
    getBestSolution(population, &bestSolution);

    // generate next populations
    while(iterations < MAX_ITERATIONS){
        generateNextPopulation(population, &randSeed);
        mutatePopulation(population, &randSeed);
        getBestSolution(population, &bestSolution);

        if(bestSolution.pcmax <= limit) earlyStop = 1;
        if(earlyStop) break;

        iterations++;
    }

    localOutput[tid].iterations = iterations;
    localOutput[tid].best = bestSolution.pcmax;

    // __syncthreads() both waits for every thread and makes their shared-memory
    // writes visible block-wide, so no separate fence is needed here.
    __syncthreads();

    if(tid == 0){
        // Start from thread 0's own result so that both output fields are
        // always defined, reduce locally, and write global memory once.
        CudaOutput winner = localOutput[0];
        for(int i = 1; i < (int)blockDim.x; ++i){
            if(localOutput[i].best < winner.best){
                winner = localOutput[i];
            }
        }
        *output = winner;
    }
}

/**
 * Terminates the program with a diagnostic if a CUDA runtime call failed.
 *
 * @param status  return value of the CUDA call
 * @param what    short description of the call, used in the message
 */
static void checkCuda(cudaError_t status, const char* what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/**
 * Loads the instance, uploads it to constant memory, runs the genetic kernel
 * in a single block of NUM_THREADS threads and prints the elapsed time and the
 * best pcmax found to stderr.
 *
 * @return 0 on success; the process exits with status 1 on any CUDA failure.
 */
int main(){
    int pNum;
    int tNum;
    int* tTimes;

    // Target pcmax: the kernel stops as soon as any thread reaches it.
    int limit = 11001;

    load_instance(&pNum, &tNum, &tTimes);

    checkCuda(cudaMemcpyToSymbol(processorsNum, &pNum, sizeof(int)), "upload of processorsNum");
    checkCuda(cudaMemcpyToSymbol(tasksNum, &tNum, sizeof(int)), "upload of tasksNum");
    checkCuda(cudaMemcpyToSymbol(taskTimes, tTimes, sizeof(int) * tNum), "upload of taskTimes");

    // The host copy is no longer needed once it lives in constant memory.
    free(tTimes);
    tTimes = NULL;

    CudaOutput hostOutput;
    CudaOutput* deviceOutput;
    checkCuda(cudaMalloc(&deviceOutput, sizeof(CudaOutput)), "cudaMalloc of output");

    fprintf(stderr, "Error: %s\n", cudaGetErrorString(cudaGetLastError()));

    const unsigned int seed = (unsigned int)(time(NULL) & 0x7fffffff);
    const clock_t startTime = clock();

    genetic<<<1, NUM_THREADS>>>(limit, deviceOutput, seed);
    // Launch-configuration errors are reported by cudaGetLastError right after
    // the launch; faults during execution come back from the synchronize call.
    const cudaError_t launchStatus = cudaGetLastError();
    const cudaError_t runStatus = cudaDeviceSynchronize();
    const clock_t workTime = clock() - startTime;

    fprintf(stderr, "Error: %s\n",
            cudaGetErrorString(launchStatus != cudaSuccess ? launchStatus : runStatus));
    checkCuda(launchStatus, "kernel launch");
    checkCuda(runStatus, "kernel execution");

    checkCuda(cudaMemcpy(&hostOutput, deviceOutput, sizeof(CudaOutput), cudaMemcpyDeviceToHost),
              "download of output");

    fprintf(stderr, "Time: %.1fs\n", (double)workTime / CLOCKS_PER_SEC);
    fprintf(stderr, "Result: %d\n", hostOutput.best);

    checkCuda(cudaFree(deviceOutput), "cudaFree of output");

    return 0;
}

P: 10 
T: 200 

Error: no error
Error: no error
Time: 450.0s
Result: 11001

