In [7]:
pip install mpi4py numba numpy

Note: you may need to restart the kernel to use updated packages.


In [8]:
from mpi4py import MPI
import numpy as np
from numba import cuda

In [3]:
from mpi4py import MPI
print("MPI test successful")


MPI test successful


In [2]:
from numba import cuda

print(cuda.detect())

Found 1 CUDA devices
id 0    b'NVIDIA GeForce RTX 4060 Laptop GPU'                              [SUPPORTED]
                      Compute Capability: 8.9
                           PCI Device ID: 0
                              PCI Bus ID: 1
                                    UUID: GPU-1ab98dcd-76e5-ee1b-ec1c-c1efb90cdc10
                                Watchdog: Enabled
                            Compute Mode: WDDM
             FP32/FP64 Performance Ratio: 64
Summary:
	1/1 devices are supported
True


In [3]:
from numba import cuda
print(cuda.gpus)

<Managed Device 0>


In [9]:
@cuda.jit
def game_of_life_kernel(current_grid, next_grid, rows, cols):
    # Get thread's absolute position within the grid
    x, y = cuda.grid(2)

    if x >= rows or y >= cols:
        return  # Out of bounds

    # Count live neighbors
    live_neighbors = 0
    for i in range(-1, 2):
        for j in range(-1, 2):
            if i == 0 and j == 0:
                continue
            neighbor_x = (x + i + rows) % rows
            neighbor_y = (y + j + cols) % cols
            live_neighbors += current_grid[neighbor_x, neighbor_y]

    # Apply rules of Game of Life
    if current_grid[x, y] == 1:
        if live_neighbors < 2 or live_neighbors > 3:
            next_grid[x, y] = 0
        else:
            next_grid[x, y] = 1
    else:
        if live_neighbors == 3:
            next_grid[x, y] = 1
        else:
            next_grid[x, y] = 0


In [10]:
def mpi_game_of_life(grid_size, num_generations, proc_grid_size):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    rows, cols = grid_size

    # Each process will work on a subgrid
    local_rows = rows // proc_grid_size
    local_grid = np.random.randint(2, size=(local_rows, cols))

    # Allocate space for the next grid state
    next_grid = np.zeros_like(local_grid)

    # Setup GPU grid and thread dimensions
    threads_per_block = (16, 16)
    blocks_per_grid_x = (local_rows + threads_per_block[0] - 1) // threads_per_block[0]
    blocks_per_grid_y = (cols + threads_per_block[1] - 1) // threads_per_block[1]
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

    for _ in range(num_generations):
        # Copy data to GPU
        d_local_grid = cuda.to_device(local_grid)
        d_next_grid = cuda.to_device(next_grid)

        # Call kernel
        game_of_life_kernel[blocks_per_grid, threads_per_block](d_local_grid, d_next_grid, local_rows, cols)

        # Copy result back to host
        d_next_grid.copy_to_host(next_grid)

        # Exchange rows between neighboring processes
        if rank > 0:
            comm.Sendrecv(local_grid[0, :], dest=rank - 1, sendtag=11,
                          recvbuf=local_grid[-1, :], source=rank - 1, recvtag=11)
        if rank < size - 1:
            comm.Sendrecv(local_grid[-1, :], dest=rank + 1, sendtag=12,
                          recvbuf=local_grid[0, :], source=rank + 1, recvtag=12)

        # Update local grid with next generation
        local_grid = next_grid.copy()

    # Gather final grid at root process (rank 0)
    full_grid = None
    if rank == 0:
        full_grid = np.zeros((rows, cols))
    comm.Gather(local_grid, full_grid, root=0)

    return full_grid


In [11]:
def main():
    grid_size = (128, 128)  # Example grid size
    num_generations = 100   # Example number of generations
    proc_grid_size = MPI.COMM_WORLD.Get_size()  # Number of processes
    final_grid = mpi_game_of_life(grid_size, num_generations, proc_grid_size)

    if MPI.COMM_WORLD.Get_rank() == 0:
        print("Final grid after {} generations:".format(num_generations))
        print(final_grid)

if __name__ == "__main__":
    main()



Final grid after 100 generations:
[[0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000
  0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000
  0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 4.94065646e-324
  2.12199579e-314 0.00000000e+000]
 ...
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000
  0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000
  0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000
  0.00000000e+000 0.00000000e+000]]
