SP23-BAI-046
Noor Fatima

**imports & GPU check**


In [2]:

import numpy as np
import time
from numba import njit, prange
from numba import cuda
from PIL import Image
import io
import sys

print("Numba:", np.__version__)
print("CUDA available:", cuda.is_available())
if cuda.is_available():
    dev = cuda.get_current_device()
    print("CUDA device name:", dev.name)
    print("Compute capability:", dev.compute_capability)
    print("Max threads per block:", dev.MAX_THREADS_PER_BLOCK)
else:
    print("No CUDA device found. Please enable GPU in Colab: Runtime -> Change runtime type -> GPU")


Numba: 2.0.2
CUDA available: True
CUDA device name: b'Tesla T4'
Compute capability: (7, 5)
Max threads per block: 1024


**Hello GPU (threads / blocks / grids)**

In [5]:
import cupy as cp

# Kernel writes each thread's global id into the output array
hello_kernel = cp.ElementwiseKernel(
    '',
    'int32 tid',
    'tid = i;',  # i is the built-in thread/global index for elementwise
    'hello_kernel'
)

n = 16
out = cp.zeros(n, dtype=cp.int32)
hello_kernel(out)

# Bring results back to host
out_host = cp.asnumpy(out)
for idx, tid in enumerate(out_host):
    print(f"Hello from thread {tid}")


Hello from thread 0
Hello from thread 1
Hello from thread 2
Hello from thread 3
Hello from thread 4
Hello from thread 5
Hello from thread 6
Hello from thread 7
Hello from thread 8
Hello from thread 9
Hello from thread 10
Hello from thread 11
Hello from thread 12
Hello from thread 13
Hello from thread 14
Hello from thread 15


**Vector Addition (CPU vs GPU with CuPy)**

In [7]:
import numpy as np
import cupy as cp
import time

# Problem size
N = 10_000_000
print("Allocating arrays of length:", N)

# CPU arrays
a_cpu = np.random.rand(N).astype(np.float32)
b_cpu = np.random.rand(N).astype(np.float32)

# ---------------- CPU baseline ----------------
t0 = time.perf_counter()
out_cpu = a_cpu + b_cpu   # NumPy vectorized (uses optimized C under the hood)
t1 = time.perf_counter()
cpu_time = t1 - t0
print(f"CPU (NumPy) time: {cpu_time:.4f} s")

# ---------------- GPU (CuPy) ----------------
a_gpu = cp.asarray(a_cpu)
b_gpu = cp.asarray(b_cpu)

# warm-up
_ = a_gpu + b_gpu
cp.cuda.Stream.null.synchronize()

# kernel timing only (data already on GPU)
t0 = time.perf_counter()
out_gpu = a_gpu + b_gpu
cp.cuda.Stream.null.synchronize()
t1 = time.perf_counter()
gpu_kernel_time = t1 - t0
print(f"GPU kernel time (CuPy): {gpu_kernel_time:.4f} s")

# end-to-end timing (including H2D + compute + D2H)
t0 = time.perf_counter()
out_gpu_full = cp.asnumpy(cp.asarray(a_cpu) + cp.asarray(b_cpu))
t1 = time.perf_counter()
gpu_total_time = t1 - t0
print(f"Total GPU time (incl transfers): {gpu_total_time:.4f} s")

# ---------------- Speedup ----------------
speedup_kernel = cpu_time / gpu_kernel_time if gpu_kernel_time > 0 else np.inf
speedup_total  = cpu_time / gpu_total_time if gpu_total_time > 0 else np.inf
print(f"Speedup (CPU / GPU kernel-only): {speedup_kernel:.2f}x")
print(f"Speedup (CPU / GPU end-to-end): {speedup_total:.2f}x")

# ---------------- Verify correctness ----------------
print("Results identical:", np.allclose(out_cpu, cp.asnumpy(out_gpu), atol=1e-6))


Allocating arrays of length: 10000000
CPU (NumPy) time: 0.0156 s
GPU kernel time (CuPy): 0.0009 s
Total GPU time (incl transfers): 0.0428 s
Speedup (CPU / GPU kernel-only): 17.90x
Speedup (CPU / GPU end-to-end): 0.36x
Results identical: True


**Image Inversion (CPU vs GPU with CuPy)**

In [8]:
from PIL import Image
import numpy as np
import cupy as cp
import time
import os

# Try to load an uploaded image; else fallback to gradient
try:
    fname = "sample.png"  # change this if you upload your own
    if not os.path.exists(fname):
        raise FileNotFoundError
except:
    # fallback: create gradient image
    W, H = 2048, 1024
    arr = np.zeros((H, W, 3), dtype=np.uint8)
    for y in range(H):
        arr[y, :, :] = np.linspace(0, 255, W, dtype=np.uint8)[:, None].repeat(3, axis=1)
    img = Image.fromarray(arr)
    fname = "sample_gradient.png"
    img.save(fname)
    print("Saved fallback image:", fname)

# Load image
img = Image.open(fname).convert("RGB")
arr = np.array(img)  # shape (H, W, 3), dtype=uint8
print("Image shape:", arr.shape)

# ---------------- CPU inversion ----------------
t0 = time.perf_counter()
inv_cpu = 255 - arr
t1 = time.perf_counter()
cpu_time_img = t1 - t0
print(f"CPU invert time: {cpu_time_img:.4f} s")

# ---------------- GPU inversion ----------------
arr_gpu = cp.asarray(arr)

t0 = time.perf_counter()
inv_gpu = 255 - arr_gpu
cp.cuda.Stream.null.synchronize()
t1 = time.perf_counter()
gpu_time_img = t1 - t0
print(f"GPU invert time: {gpu_time_img:.4f} s")

# Copy result back
inv_gpu_host = cp.asnumpy(inv_gpu)

# ---------------- Verify ----------------
identical = np.array_equal(inv_cpu, inv_gpu_host)
print("Outputs identical:", identical)

# Save results
Image.fromarray(inv_cpu).save("inverted_cpu.png")
Image.fromarray(inv_gpu_host).save("inverted_gpu.png")
print("Saved inverted_cpu.png and inverted_gpu.png")


Saved fallback image: sample_gradient.png
Image shape: (1024, 2048, 3)
CPU invert time: 0.0029 s
GPU invert time: 0.1445 s
Outputs identical: True
Saved inverted_cpu.png and inverted_gpu.png
