In [8]:
Hybrid_code = r"""
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <cuda_runtime.h>

#define IDX(i,j,N) ((i)*(N)+(j))

__global__ void update_gpu(double *u, double *uNew, int Nx, int Ny, int start, int end, 
                          double dx, double dy, double alpha, double dt) {
    int i = blockIdx.x * blockDim.x + threadIdx.x + start;
    int j = blockIdx.y * blockDim.y + threadIdx.y + 1;
    if (i >= start && i < end && j < Ny-1) {
        const double dx2 = dx*dx;
        const double dy2 = dy*dy;
        const double factor = alpha*dt;
        
        double uxx = (u[IDX(i+1,j,Ny)] - 2.0*u[IDX(i,j,Ny)] + u[IDX(i-1,j,Ny)]) / dx2;
        double uyy = (u[IDX(i,j+1,Ny)] - 2.0*u[IDX(i,j,Ny)] + u[IDX(i,j-1,Ny)]) / dy2;
        uNew[IDX(i,j,Ny)] = fma(factor, (uxx + uyy), u[IDX(i,j,Ny)]);
    }
}

int main() {
    int Nx = 200, Ny = 200, Nt = 1000;
    double Lx = 1.0, Ly = 1.0;
    double alpha = 0.0001;
    double dx = Lx / (Nx - 1), dy = Ly / (Ny - 1);
    double dt = 0.25 * fmin(dx*dx, dy*dy) / alpha;
    size_t N = Nx * Ny;

    double *u = (double*)malloc(N * sizeof(double));
    double *uNew = (double*)malloc(N * sizeof(double));

    for(int i = 0; i < Nx; i++) {
        for(int j = 0; j < Ny; j++) {
            double x = i * dx - Lx / 2, y = j * dy - Ly / 2;
            u[IDX(i,j,Ny)] = exp(-50*(x*x + y*y));
        }
    }

    double *d_u, *d_uNew;
    cudaMalloc(&d_u, N * sizeof(double));
    cudaMalloc(&d_uNew, N * sizeof(double));

    cudaMemcpy(d_u, u, N * sizeof(double), cudaMemcpyHostToDevice);

    int cpu_start = 1, cpu_end = Nx/2;
    int gpu_start = Nx/2, gpu_end = Nx - 1;

    dim3 block(16, 16);
    dim3 grid((Nx + block.x - 1)/block.x, (Ny + block.y - 1)/block.y);


    int num_threads = 4;  
    omp_set_num_threads(num_threads);
    

    double t0 = omp_get_wtime();

    for (int n = 0; n < Nt; n++) {
        update_gpu<<<grid, block>>>(d_u, d_uNew, Nx, Ny, gpu_start, gpu_end, dx, dy, alpha, dt);
        #pragma omp parallel for collapse(2)
        for (int i = cpu_start; i < cpu_end; i++) {
            for (int j = 1; j < Ny - 1; j++) {
                double uxx = (u[IDX(i+1,j,Ny)] - 2*u[IDX(i,j,Ny)] + u[IDX(i-1,j,Ny)]) / (dx*dx);
                double uyy = (u[IDX(i,j+1,Ny)] - 2*u[IDX(i,j,Ny)] + u[IDX(i,j-1,Ny)]) / (dy*dy);
                uNew[IDX(i,j,Ny)] = u[IDX(i,j,Ny)] + alpha*dt*(uxx + uyy);
            }
            
        }
        // cudaDeviceSynchronize();

        cudaMemcpy(&uNew[IDX(gpu_start,0,Ny)], &d_uNew[IDX(gpu_start,0,Ny)],
                   (gpu_end - gpu_start) * Ny * sizeof(double), cudaMemcpyDeviceToHost);



        double *tmp = u; u = uNew; uNew = tmp;

        cudaMemcpy(d_u, u, N * sizeof(double), cudaMemcpyHostToDevice);
    }

    double t1 = omp_get_wtime();
    double elapsed = t1 - t0;

    double updates = (double)Nt*(Nx-2)*(Ny-2);
    double mlups = updates / elapsed / 1e6;
    FILE *file = fopen("hybrid_heat_distribution.csv", "w");
    for(int i = 0; i < Nx; i++) {
        for(int j = 0; j < Ny; j++) {
            fprintf(file, "%.10e", u[IDX(i,j,Ny)]);
            if(j < Ny-1) fprintf(file, ",");
        }
        fprintf(file, "\n");
    }

    printf("Hybrid OpenMP + CUDA run:\\n");
    printf("  Time           : %.6f s\\n", elapsed);
    printf("  Throughput     : %.2f MLUPS\\n", mlups);
    printf("  u_center (mid) : %f\\n", u[IDX(Nx/2, Ny/2, Ny)]);

    free(u);
    free(uNew);
    cudaFree(d_u);
    cudaFree(d_uNew);

    return 0;
}
"""

In [9]:
with open("Hybrid_heat.cu", "w") as f:
    f.write(Hybrid_code)

In [4]:
# !nvcc -Xcompiler -prec-div -prec-sqrt -fopenmp -lgomp -o hybrid_run Hybrid_heat.cu

nvcc fatal   : '-fopenmp': expected true or false


In [10]:
!nvcc -Xcompiler="-fopenmp" --fmad=false --prec-div=true --prec-sqrt=true -o hybrid_run Hybrid_heat.cu -lgomp

In [11]:
!./hybrid_run

Hybrid OpenMP + CUDA run:\n  Time           : 0.278468 s\n  Throughput     : 140.78 MLUPS\n  u_center (mid) : 0.441778\n

In [6]:
!nvidia-smi  

Thu Jul 17 10:09:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [11]:
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   4
  On-line CPU(s) list:    0-3
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.00GHz
    CPU family:           6
    Model:                85
    Thread(s) per core:   2
    Core(s) per socket:   2
    Socket(s):            1
    Stepping:             3
    BogoMIPS:             4000.34
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht sysc
                          all nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xt
                          opology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq
                           ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
                           aes xsave avx f16c rdrand hypervisor 

In [7]:
rm hybrid_run hybrid_heat_distribution.csv Hybrid_heat.cu