# Week 04 Live Coding Demo — NumPy, Number Formats, Big-O, Parallel Computing
Guiding ideas: work with **arrays**, avoid Python loops where possible, and understand how **precision** and **algorithmic complexity** impact results and performance.

## 1. Array Creation — beyond the basics

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# --- grids ---
L = 0.04                                   # spatial domain length [m]
Nx, Nt = 256, 200                          # number of space / time samples
x = np.linspace(-L/2, L/2, Nx)             # x in [-L/2, L/2], both endpoints included
T = 0.02                                   # total time span [s] (one period at 50 Hz)
t = np.linspace(0.0, T, Nt)                # time in [0, T]

# --- wave parameters ---
k = 2*np.pi / L                            # wavenumber [rad/m]: one wavelength across the domain
omega = 2*np.pi * 50                       # 50 Hz angular freq [rad/s]

# --- separable space–time field: sin(k x) * cos(ω t) ---
spatial = np.sin(k * x)                    # shape (Nx,)
temporal = np.cos(omega * t)               # shape (Nt,)
# (Nt, 1) * (1, Nx) broadcasts to (Nt, Nx): rows are time samples, columns are x nodes
field = temporal[:, None] * spatial[None, :]

print(field.shape)                         # (200, 256)
print(np.round(field[:3, :5], 3))          # first 3 times × first 5 x nodes
# output for the grid above (values change too slowly to show at 3 decimals;
# the first column is sin(-π) ≈ -1e-16, which rounds to -0.):
# [[-0.    -0.025 -0.049 -0.074 -0.098]
#  [-0.    -0.025 -0.049 -0.074 -0.098]
#  [-0.    -0.025 -0.049 -0.074 -0.098]]

# optional: snapshot at t0
t0 = 0.003                                 # 3 ms
i0 = np.argmin(np.abs(t - t0))             # index of the time sample nearest to t0
snapshot = field[i0]                       # 1D spatial profile at t ≈ t0, shape (Nx,)
print('Snapshot at t0:', snapshot)

# Without `extent`, imshow ticks are pixel indices and row 0 (t = 0) is drawn at the top.
# Map the array edges to physical coordinates and put t = 0 at the bottom so the
# tick values agree with the axis labels.
plt.imshow(field, aspect='auto', cmap='bwr', origin='lower',
           extent=[x[0], x[-1], t[0], t[-1]])
plt.xlabel('x [m]')
plt.ylabel('t [s]')
plt.colorbar()
plt.show()


In [None]:
# Upper-triangular 4x4 matrix of ones (diagonal included)
A = np.triu(np.ones((4, 4)))
print(A)

# Assemble an 8x8 matrix from four scaled copies of A arranged as a 2x2 grid of tiles
top_tiles = [A, 2 * A]
bottom_tiles = [3 * A, 4 * A]
B = np.block([top_tiles, bottom_tiles])
print(B)
print(B.shape)
print(B[:3, :6])                           # first 3 rows, first 6 columns (spans both top tiles)


## 2. Indexing & Slicing — 1D/2D/3D practice

In [None]:
# 13 samples of a sine over two full periods
sig = np.sin(np.linspace(0, 4 * np.pi, 13))
# Central difference in index units: (sig[i+1] - sig[i-1]) / 2 for the 11 interior points
ahead = sig[2:]
behind = sig[:-2]
ds = (ahead - behind) / 2.0
print(np.round(sig, 3))
print(np.round(ds, 3))


In [None]:
# 2D indexing on a 10x12 "image" whose pixel values are their own flat index
n_rows, n_cols = 10, 12
img = np.arange(n_rows * n_cols).reshape(n_rows, n_cols)

crop = img[2:8, 3:9]                       # rows 2..7, columns 3..8
coarse = img[::2, ::3]                     # every 2nd row, every 3rd column
flipped = img[::-1, ::-1]                  # both axes reversed

print(crop.shape, coarse.shape, flipped.shape)
print(crop[:2, :4])


In [None]:
# 3D array of shape (nx, ny, nz); fixing the first index leaves a 2D (ny, nz) slab
nx, ny, nz = 6, 5, 4
n_total = nx * ny * nz
F = np.arange(n_total).reshape(nx, ny, nz)
slice_yz = F[3]                            # same as F[3, :, :]
print(F.shape, slice_yz.shape)
print(slice_yz)


## 3. Masks & Fancy Indexing — threshold and selection

In [None]:
# Boolean-mask indexing and integer ("fancy") indexing both return copies, not views
E = np.array([0.05, 0.2, 0.8, 1.6, 2.5, 3.1])

# Element-wise band-pass condition 0.5 <= E <= 2.0 -> boolean array of the same shape
mask_pass = np.logical_and(E >= 0.5, E <= 2.0)
print(mask_pass)

passed = E[mask_pass]                      # values where the mask is True
print(passed)

ix = [1, 3, 5]                             # explicit positions to pick
picked = E[ix]
print(picked)


In [None]:
# np.putmask writes into its first argument in place (it returns None), so work on
# a copy to leave E untouched; ~mask_pass is the boolean complement of the mask.
window = np.array(E)                       # np.array copies by default
outside_band = np.logical_not(mask_pass)
np.putmask(window, outside_band, 0.0)      # zero everything outside the pass band
print(window)


## 4. Broadcasting & Vectorized Ops — small physics model

In [None]:
# Broadcasting and vectorized operations: Planck's law for blackbody radiation.
# B[i, j] is the spectral radiance at temperature T[i] and wavelength lam[j]:
#   B(lam, T) = (2 h c^2 / lam^5) / (exp(c2 / (lam T)) - 1),   c2 = h c / k_B
# The 2 h c^2 prefactor is included so that B really is in W/m^2/sr/m.
h_planck = 6.62607015e-34                  # Planck constant [J s]
c_light = 299792458.0                      # speed of light [m/s]
lam = np.linspace(300e-9, 1200e-9, 19)     # wavelengths 300-1200 nm in 50 nm steps [m]
T = np.array([3000.0, 6000.0])             # temperatures [K]
c2 = 1.438776877e-2                        # second radiation constant h c / k_B [m K]
# lam[None, :] is (1, 19) and T[:, None] is (2, 1); they broadcast to (2, 19)
B = (2.0 * h_planck * c_light**2) / (lam[None,:]**5 * (np.exp(c2/(lam[None,:]*T[:,None])) - 1.0))
print(B.shape)
# Every other wavelength, each row normalised by its own maximum over those samples
print(np.round(B[:, ::2] / B[:, ::2].max(axis=1, keepdims=True), 3))
# One curve per temperature row of B, drawn through the explicit Axes interface
fig, ax = plt.subplots()
ax.plot(lam, B[0], label='T=3000 K')
ax.plot(lam, B[1], label='T=6000 K')
ax.set_xlabel('Wavelength [m]')
ax.set_ylabel('Spectral radiance [W/m^2/sr/m]')
ax.legend()
plt.show()


In [None]:
# Damped harmonic oscillator: exponential envelope times a cosine at the damped frequency
t = np.linspace(0, 2.0, 100)               # 100 time samples over 2 s
f0 = 2.5                                   # undamped natural frequency [Hz]
zeta = 0.2                                 # damping ratio (< 1, so the motion oscillates)

envelope = np.exp(-zeta*2*np.pi*f0*t)                      # decay exp(-ζ ω0 t)
carrier = np.cos(2*np.pi*f0*np.sqrt(1-zeta**2)*t)          # cos(ω0 sqrt(1-ζ²) t)
y = envelope * carrier
print(np.round(y, 4))
# Displacement versus time, drawn through the explicit Axes interface
fig, ax = plt.subplots()
ax.plot(t, y)
ax.set_xlabel('Time [s]')
ax.set_ylabel('Displacement [m]')
plt.show()


## 5. Reductions & Axis — summarize along dimensions

In [None]:
# Reductions over a (2, 3, 4) array: the reduced axis disappears unless keepdims=True
A = np.arange(24).reshape(2, 3, 4)
col_sum = np.sum(A, axis=2)                     # sum over the last axis  -> shape (2, 3)
row_max = np.max(A, axis=1, keepdims=True)      # max over the middle axis -> shape (2, 1, 4)
print(col_sum.shape, row_max.shape)
print(col_sum[0], row_max[0, 0])


In [None]:
# For a Gaussian with mean 0 and std 0.5, |value| > 1.0 means "outside ±2 sigma";
# about 95.5% of samples fall inside, so roughly 45 of every 1000 should be counted.
rng_noise = np.random.RandomState(0)            # fixed seed -> reproducible counts
noise = rng_noise.normal(0, 0.5, size=(4, 1000))
is_outlier = np.abs(noise) > 1.0                # boolean array, shape (4, 1000)
counts = is_outlier.sum(axis=1)                 # True counts as 1: outliers per row
print(counts)


## 6. Performance — Python loop vs NumPy vectorization

In [None]:
import time

# Sum of squares 0^2 + 1^2 + ... + (N-1)^2, first with an interpreted Python loop
N = 3_000_000
t0 = time.perf_counter()
s = 0
for k in range(N):
    s += k*k
t1 = time.perf_counter()
elapsed_loop = t1 - t0
print("Python sumsq:", s, "time(s):", round(elapsed_loop, 4))

import numpy as np

# Same sum with NumPy; the timed region includes building the array.
# The result (~9.0e18) still fits in int64 (max ~9.22e18), but only just.
t2 = time.perf_counter()
arr = np.arange(N, dtype=np.int64)
squares = arr * arr
s_np = int(squares.sum())
t3 = time.perf_counter()
elapsed_numpy = t3 - t2
print("NumPy sumsq :", s_np, "time(s):", round(elapsed_numpy, 4))


## 7. Number Formats — precision (ε) and dynamic range

In [None]:
import numpy as np
# Machine epsilon: the gap between 1.0 and the next representable value
for label, dtype in (("eps16:", np.float16), ("eps32:", np.float32), ("eps64:", np.float64)):
    print(label, np.finfo(dtype).eps)

# Dynamic range: smallest positive normal number (tiny) and largest finite number (max)
for tiny_label, max_label, dtype in (("tiny32:", "max32:", np.float32),
                                     ("tiny64:", "max64:", np.float64)):
    info = np.finfo(dtype)
    print(tiny_label, info.tiny, max_label, info.max)


In [None]:
x = np.float32(100.0)  # 32-bit float scalar

# exp(±100) evaluated in float32 shows the limited dynamic range of the format
y = np.float32(np.exp(x))    # exp(100) is above the float32 maximum → inf (RuntimeWarning)
z = np.float32(np.exp(-x))   # exp(-100) is below the smallest normal → subnormal (~3.8e-44)

print("exp32(100):", y, "exp32(-100):", z)

# The same values are finite and normal in float64
x64 = 100.0  # Python floats are float64
big64 = np.exp(x64)
small64 = np.exp(-x64)
print("exp64(100):", big64, "exp64(-100):", small64)


In [None]:
# Add 0.1 to a running total n times, once in float32 and once in float64.
# The exact answer is 10000; rounding error accumulates much faster in float32.
n = 100_000


def _repeated_add(step, count):
    """Return step added to zero `count` times, staying in step's own scalar type."""
    total = type(step)(0.0)
    for _ in range(count):
        total += step
    return total


a32 = np.float32(0.1)
s32 = _repeated_add(a32, n)
a64 = np.float64(0.1)
s64 = _repeated_add(a64, n)
print("sum32:", s32)
print("sum64:", s64)


## 8. Modern formats — FP16, bfloat16, and TF32 (emulated)

In [None]:
import numpy as np
# float16 is built into NumPy: 1e-3 and 1e3 are stored with only ~3 significant digits
h = np.array([1.0, 1e-3, 1e3], dtype=np.float16)
print("float16:", h, h.dtype)

# bfloat16 is not a built-in NumPy dtype; np.dtype('bfloat16') raises TypeError unless
# some imported package has registered it. Only that lookup is guarded, so unrelated
# errors are not swallowed.
try:
    bf16 = np.dtype('bfloat16')
except TypeError as e:
    print("bfloat16 available:", False, e)
else:
    bf = np.array([1.0, 1e-3, 1e3], dtype=bf16)
    print("bfloat16 available:", True, bf.dtype)


In [None]:
# TF32 is a modern format that is not natively supported by NumPy
# but can be emulated using bit manipulation
import numpy as np

def tf32_round(x32, keep=10):
    """Round float32 values to `keep` explicit mantissa bits (TF32 keeps 10).

    The value is reinterpreted as a uint32 bit pattern, half a unit of the last
    kept bit is added, and the dropped low bits are cleared. This rounds the
    magnitude to nearest with ties away from zero; a mantissa carry propagates
    into the exponent, so values just below a power of two round up correctly
    and the largest finite values round to infinity.

    Parameters
    ----------
    x32 : array_like
        Input values; converted to float32. The input is never modified.
    keep : int, optional
        Number of explicit mantissa bits to retain, between 0 and 23.

    Returns
    -------
    Rounded float32 values with the shape of the input. NaN and ±inf are
    returned unchanged.

    Raises
    ------
    ValueError
        If `keep` is outside 0..23.
    """
    if not 0 <= keep <= 23:
        raise ValueError(f"keep must be between 0 and 23, got {keep!r}")

    a = np.asarray(x32, dtype=np.float32)            # ensure float32
    drop = 23 - int(keep)                            # how many mantissa bits to drop
    if drop == 0:                                    # nothing to round
        return a.copy()

    ui = a.view(np.uint32)                           # bit view of the same memory
    round_bit = np.uint32(1 << (drop - 1))           # half-ulp at the cut
    mask = np.uint32((0xFFFFFFFF << drop) & 0xFFFFFFFF)  # keep high bits, zero low bits

    rounded = ((ui + round_bit) & mask).view(np.float32)  # add-round and clear low bits

    # The integer trick is only valid for finite values: on a NaN the carry can run
    # into the sign bit (giving -0.0) or the mask can wipe the payload (giving inf).
    result = np.where(np.isfinite(a), rounded, a)
    return result[()]                                # 0-d result -> float32 scalar; arrays unchanged

# Quick check: compare a few float32 values with their 10-bit-mantissa approximations
vals = np.array([0.12345679, 123.5679, -0.9876543], dtype=np.float32)
approx = tf32_round(vals)
abs_err = np.abs(vals - approx)
print("float32:", vals)       # float32: [ 1.2345679e-01  1.2356790e+02 -9.8765433e-01]
print("TF32~  :", approx)     # TF32~  : [ 1.2347412e-01  1.2356250e+02 -9.8779297e-01]
print("abs err:", abs_err)    # abs err: [1.7330050e-05 5.4016113e-03 1.3864040e-04]



## 9. Memory Footprint — dtype and nbytes

In [None]:
# Same element count, different dtype: float32 uses 4 bytes per element, float64 uses 8
N = 2_000_000
a32 = np.ones(N, dtype=np.float32)
a64 = np.ones(N, dtype=np.float64)
print("a32 bytes:", a32.nbytes, "a64 bytes:", a64.nbytes)
# Freshly allocated arrays are laid out contiguously in C (row-major) order
print("C-contiguous?", a32.flags.c_contiguous, a64.flags.c_contiguous)


## 10. Integer Overflow — wrap-around warning

In [None]:
# Fixed-width integer arrays wrap around silently: int32 max + 1 becomes int32 min
int32_max = 2**31 - 1
x = np.array([int32_max], dtype=np.int32)
wrapped = x + 1                            # result stays int32, so the value wraps
print("before:", x[0])
print("after :", wrapped[0])


## 11. Big-O timing sketches — scaling behavior

In [None]:
import time, math, numpy as np
def time_sum(N):
    """Return the wall-clock seconds taken to sum N float64 values (O(N) work).

    Building the array is not part of the timed region.
    """
    x = np.arange(N, dtype=np.float64)
    start = time.perf_counter()
    _ = x.sum()
    stop = time.perf_counter()
    return stop - start
def time_sort(N):
    """Return the wall-clock seconds taken to sort N random floats (O(N log N) work).

    The data come from a fixed seed, and generating them is not timed.
    """
    x = np.random.RandomState(0).rand(N)
    start = time.perf_counter()
    _ = np.sort(x)
    stop = time.perf_counter()
    return stop - start
def time_pairs(N):
    """Time the all-pairs squared distances between N random 2D points (O(N^2) work).

    Returns a tuple (elapsed_seconds, shape_of_distance_matrix). Broadcasting
    builds an (N, N, 2) temporary before it is reduced to (N, N), so memory
    grows quadratically with N as well.
    """
    pts = np.random.RandomState(0).rand(N, 2)
    start = time.perf_counter()
    deltas = pts[:, None, :] - pts[None, :, :]     # (N, 1, 2) - (1, N, 2) -> (N, N, 2)
    D2 = (deltas**2).sum(axis=2)                   # squared Euclidean distance per pair
    stop = time.perf_counter()
    return stop - start, D2.shape
# Problem sizes for the O(N) and O(N log N) cases
N_list = [1_000, 10_000, 100_000]
# The pairwise case materialises an (N, N, 2) float64 temporary (16 * N**2 bytes):
# N = 100_000 would need ~160 GB and N = 10_000 already ~1.6 GB, so it gets its own
# smaller sizes. Doubling N should roughly quadruple the time.
N_pairs_list = [500, 1_000, 2_000]
for N in N_list:
    print("O(N) sum   ", N, "sec:", round(time_sum(N), 5))
for N in N_list:
    print("O(N log N) ", N, "sec:", round(time_sort(N), 5))
for N in N_pairs_list:
    dt, shape = time_pairs(N)
    print("O(N^2) pairs", N, shape, "sec:", round(dt, 5))


## 12. Parallel Computing — matrix multiply may use threaded BLAS

In [None]:
import numpy as np, time
# Time one dense n x n matrix product; NumPy hands this to the BLAS library it was built with
n = 800
A = np.random.RandomState(0).rand(n, n)
B = np.random.RandomState(1).rand(n, n)
t0 = time.perf_counter()
C = np.matmul(A, B)                        # same operation as A @ B
t1 = time.perf_counter()
elapsed = t1 - t0
print("C shape:", C.shape, "time(s):", round(elapsed, 3))


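Three quick ways to check (and prove) whether your `A @ B` used multiple threads: (1) does `threadpool_info()` report `num_threads` > 1? (2) is the ratio of CPU time to wall time greater than 1? (3) does the multiply slow down when the thread limit is set to 1?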

In [None]:
import numpy as np, time
from threadpoolctl import threadpool_info, threadpool_limits  # pip install threadpoolctl

# --- helper: do one matmul and return wall and CPU times ---
def run_once(n=3000):
    """Multiply two random n x n matrices once and time it.

    Returns a tuple (wall_seconds, cpu_seconds): elapsed wall-clock time and
    process CPU time spent on the product. Creating the inputs is not timed.
    """
    rng = np.random.RandomState(0)
    A = rng.rand(n, n)                               # contiguous, BLAS-friendly
    B = rng.rand(n, n)
    wall_start = time.perf_counter()                 # wall-clock time
    cpu_start = time.process_time()                  # CPU time of this process
    A @ B                                            # NumPy calls GEMM in BLAS
    wall_stop = time.perf_counter()
    cpu_stop = time.process_time()
    return wall_stop - wall_start, cpu_stop - cpu_start

# (1) Inspect active threadpools
pools = threadpool_info()              # shows MKL/OpenBLAS/Accelerate and num_threads
print(pools)

# (2) Measure wall vs CPU time (CPU > wall ⇒ parallel native threads)
_ = run_once()                         # warm-up run, result discarded
default_timing = run_once()
wall, cpu = default_timing
print(f"wall={wall:.3f}s  cpu={cpu:.3f}s  cpu/wall≈{cpu/wall:.1f}x")  # >1 means multi-threaded BLAS

# (3) Force 1 thread and compare (proof by experiment)
with threadpool_limits(limits=1):      # set MKL/OPENBLAS/Accelerate threads to 1 for this block
    single_thread_timing = run_once()
wall1, cpu1 = single_thread_timing
print(f"1-thread wall={wall1:.3f}s  speedup≈{wall1/wall:.1f}x vs default")  # >1 confirms parallelism before


## 13. Vectorize vs np.vectorize — convenience vs speed

In [None]:
import numpy as np, time
x = np.linspace(0, 1, 400_000)


def f_loop(arr):
    """Square root computed element by element in a Python-level loop."""
    return [t**0.5 for t in arr]


def f_true(arr):
    """Square root as one array expression, evaluated in compiled NumPy code."""
    return arr**0.5


# np.vectorize gives array-style call syntax but still calls the Python function per element
f_wrap = np.vectorize(lambda t: t**0.5)

candidates = (
    ('Python loop', f_loop),
    ('True ufunc', f_true),
    ('np.vectorize', f_wrap),
)
for name, func in candidates:
    t0 = time.perf_counter()
    _ = func(x)
    dt = time.perf_counter() - t0
    print(f"{name:13s} {dt: .4f} s")
