In [1]:
import pyopencl as cl
import numpy as np
import time

# ============================================================
# 1. GPU KERNEL (OpenCL C code - runs on GPU)
# ============================================================
# OpenCL C source for the `series_sum` kernel, compiled at runtime by
# cl.Program(...).build(). Each work-item reads one float from `a_array` and
# one from `b_array` at its global id and stores their sum at the same index
# of `result`.
# NOTE: the kernel has no bounds check on `gid`, so the global work size used
# at launch must not exceed the number of elements in any of the three buffers.
KERNEL_CODE = """
__kernel void series_sum(
    __global const float *a_array,      // Input series A
    __global const float *b_array,      // Input series B
    __global float *result              // Output (sum of each element)
)
{
    // get_global_id(0) = which element this GPU thread handles
    int gid = get_global_id(0);
    
    // Each GPU thread computes ONE element
    result[gid] = a_array[gid] + b_array[gid];
}
"""

# ============================================================
# 2. SETUP GPU CONTEXT & QUEUE
# ============================================================
# Collect GPU devices from every installed OpenCL platform instead of assuming
# that platform 0 exposes one: on machines with several OpenCL runtimes
# installed, the first platform may be CPU-only, in which case indexing its
# (empty) GPU device list fails with an unhelpful IndexError.
platforms = cl.get_platforms()
devices = [
    device
    for platform in platforms
    for device in platform.get_devices(device_type=cl.device_type.GPU)
]
if not devices:
    raise RuntimeError(
        f"No OpenCL GPU device found on any of the {len(platforms)} "
        "available platform(s)"
    )

# First GPU found (the original notebook noted an RX 6500M here).
gpu_device = devices[0]

# One context and one default command queue bound to the selected device.
ctx = cl.Context([gpu_device])
queue = cl.CommandQueue(ctx)

# Compile the OpenCL source once and keep a handle to the kernel so the
# streaming loop can launch it repeatedly without re-resolving it by name.
program = cl.Program(ctx, KERNEL_CODE).build()
kernel = program.series_sum

# ============================================================
# 3. YOUR DATA MODEL - STRAWS
# ============================================================
# Sizing parameters for the streaming layout.
STRAW_SIZE = 1000            # elements covered by a single straw
BLOCK_SIZE = 1_000_000       # 1000 straws' worth of elements (not referenced in this cell)
TOTAL_SIZE = 1_000_000_000   # full-scale target: one billion elements

# Demo override: shrink the problem to ten million elements.
TOTAL_SIZE = 10_000_000

# Each straw is stored only as a half-open (start, end) index range; the
# element values themselves are generated later, one straw at a time.
# min() clamps the final range in case TOTAL_SIZE is not a multiple of
# STRAW_SIZE.
straws_list = [
    (lower, min(lower + STRAW_SIZE, TOTAL_SIZE))
    for lower in range(0, TOTAL_SIZE, STRAW_SIZE)
]

print(f"Total straws: {len(straws_list)}")
print(f"First 3 straws: {straws_list[:3]}")
print(f"Last 3 straws: {straws_list[-3:]}\n")

# ============================================================
# 4. STREAMING COMPUTATION - PROCESS ONE STRAW AT A TIME
# ============================================================

def series_func_a(x):
    """Evaluate series A at ``x``: the square of ``x``.

    Uses the power operator's semantics, so NumPy arrays are squared
    element-wise and plain numbers yield a plain number.
    """
    return pow(x, 2)

def series_func_b(x):
    """Evaluate series B at ``x``: twice ``x``.

    NumPy arrays are doubled element-wise; plain numbers yield a plain number.
    """
    doubled = 2 * x
    return doubled

# Stream the series through the GPU one straw at a time: generate the inputs
# on the host, add them element-wise on the device, read the result back and
# fold it into a running total.
mf = cl.mem_flags

DEMO_STRAW_COUNT = 10  # number of straws processed in this demo run
PROGRESS_EVERY = 2     # print a progress line every N straws

# Accumulate in float64. The per-element values are float32 (the kernel works
# on `float`), but summing thousands of them in float32 discards low-order
# digits once the running total reaches ~1e11.
total_sum = 0.0
straw_results = []

print("Processing straws on GPU...\n")
# perf_counter() is monotonic and high resolution, so the measured interval is
# not affected by system clock adjustments. Note that the interval covers host
# data generation and transfers as well as the kernel itself.
start_time = time.perf_counter()

for straw_idx, (start, end) in enumerate(straws_list[:DEMO_STRAW_COUNT]):
    straw_size = end - start

    # STEP 1: generate the inputs for THIS straw on the CPU.
    # The kernel reads `float` (32-bit) data, so force float32, C-contiguous
    # arrays even if a series formula is later changed to return another dtype.
    x = np.arange(start, end, dtype=np.float32)
    a_data = np.ascontiguousarray(series_func_a(x), dtype=np.float32)
    b_data = np.ascontiguousarray(series_func_b(x), dtype=np.float32)

    a_buf = b_buf = result_buf = None
    try:
        # STEP 2: allocate device buffers; the inputs are copied from the host
        # arrays at creation time (COPY_HOST_PTR).
        a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_data)
        b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_data)
        result_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a_data.nbytes)

        # STEP 3: launch one work-item per element of the straw.
        kernel(queue, (straw_size,), None, a_buf, b_buf, result_buf)
        queue.finish()  # wait for the kernel to complete

        # STEP 4: read the result back into host memory.
        result = np.empty_like(a_data)
        cl.enqueue_copy(queue, result, result_buf)
    finally:
        # Release device memory even when a step above raised; otherwise the
        # buffers stay allocated until garbage collection.
        for buf in (a_buf, b_buf, result_buf):
            if buf is not None:
                buf.release()

    # STEP 5: fold this straw into the running total using a float64
    # accumulator.
    straw_sum = np.sum(result, dtype=np.float64)
    total_sum += straw_sum
    straw_results.append(straw_sum)

    if (straw_idx + 1) % PROGRESS_EVERY == 0:
        print(f"  Processed {straw_idx + 1} straws... (Total sum so far: {total_sum:.2e})")

elapsed = time.perf_counter() - start_time
print(f"\n✅ GPU Processing Time: {elapsed:.4f} seconds")
print(f"Total Sum: {total_sum:.2e}")
print(f"Average per straw: {np.mean(straw_results):.2e}")


Total straws: 10000
First 3 straws: [(0, 1000), (1000, 2000), (2000, 3000)]
Last 3 straws: [(9997000, 9998000), (9998000, 9999000), (9999000, 10000000)]

Processing straws on GPU...

  Processed 2 straws... (Total sum so far: 2.67e+09)
  Processed 4 straws... (Total sum so far: 2.13e+10)
  Processed 6 straws... (Total sum so far: 7.20e+10)
  Processed 8 straws... (Total sum so far: 1.71e+11)
  Processed 10 straws... (Total sum so far: 3.33e+11)

✅ GPU Processing Time: 0.0068 seconds
Total Sum: 3.33e+11
Average per straw: 3.33e+10


  _create_built_program_from_source_cached(
