# Signal Processing with CUDA and Numba

This notebook demonstrates GPU acceleration for signal processing using Numba:
- FFT-like operations on time series data
- Memory coalescing optimization
- Performance comparison between CPU and GPU implementations
- Visualization of results

In [ ]:
# Install required packages
!pip install numba seaborn

# Import libraries
import numpy as np
from numba import cuda, vectorize
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import pandas as pd
import math

## Signal Generation
Generate a synthetic test signal containing multiple frequency components plus Gaussian noise.

In [ ]:
def generate_signal(n_samples, frequencies=(10, 25, 50)):
    """Generate a noisy test signal made of several sine components.

    Args:
        n_samples: Number of samples. The time axis is
            ``np.linspace(0, 1, n_samples)``, so both 0 and 1 are included.
        frequencies: Iterable of sine frequencies, in cycles per unit of
            ``t``. The default is an immutable tuple so it can never be
            mutated and shared between calls.

    Returns:
        A ``(t, signal)`` tuple: the time axis and the sum of the unit-amplitude
        sines plus Gaussian noise (mean 0, standard deviation 0.1). The noise
        comes from NumPy's global random state, so results are only
        reproducible if the caller seeds it.
    """
    t = np.linspace(0, 1, n_samples)
    signal = np.zeros_like(t)
    # Accumulate one component at a time; an empty `frequencies` leaves
    # only the noise term.
    for f in frequencies:
        signal += np.sin(2 * np.pi * f * t)
    return t, signal + np.random.normal(0, 0.1, n_samples)

# Smoke-test the generator: build a 1000-sample signal and preview its start
t_test, signal_test = generate_signal(1000)
# Only the leading 100 samples are drawn so individual cycles stay visible
plt.plot(t_test[0:100], signal_test[0:100])
plt.title('Example Signal (first 100 samples)')
plt.show()

## GPU Implementations
1. Vectorized window function - Simple element-wise operation
2. Custom FFT kernel - Demonstrates memory coalescing

In [ ]:
@vectorize(['complex64(float32)'], target='cuda')
def apply_window(x):
    """Evaluate ``0.5 * (1 - cos(2*pi*x))`` element-wise as a CUDA ufunc.

    The declared signature takes ``float32`` inputs and returns ``complex64``.

    NOTE(review): the expression is real-valued, yet the declared return type
    is complex64 -- confirm this is intentional.
    NOTE(review): this matches a Hann window only if ``x`` is the normalized
    sample position in [0, 1], not the sample value -- verify against callers.
    """
    phase = 2 * np.pi * x
    return 0.5 * (1 - np.cos(phase))

@cuda.jit
def compute_fft_elements(signal, output):
    """CUDA kernel: write the magnitude of one direct-DFT bin per thread.

    The thread with global index ``i`` accumulates
    ``sum_k signal[k] * exp(-2j*pi*i*k/N)`` over the whole input and stores
    the magnitude of that sum in ``output[i]``. Threads whose index falls
    outside the signal do nothing.

    Args:
        signal: 1-D device array of input samples.
        output: 1-D device array receiving one magnitude per input sample.

    NOTE(review): every thread walks ``signal`` in index order; whether this
    yields coalesced global-memory access is not established by the code
    itself -- confirm with a profiler before relying on that claim.
    """
    bin_idx = cuda.grid(1)
    n = signal.size
    # Guard clause: the grid is rounded up, so surplus threads exit early
    if bin_idx >= n:
        return

    # Per-thread constant part of the twiddle angle
    phase_step = -2.0 * np.pi * bin_idx
    re = 0.0
    im = 0.0
    for k in range(n):
        angle = phase_step * k / n
        re += signal[k] * math.cos(angle)
        im += signal[k] * math.sin(angle)
    output[bin_idx] = (re * re + im * im) ** 0.5

## Analysis Function
A wrapper function that selects the CPU or GPU implementation and manages GPU memory transfers.

In [ ]:
def analyze_signal(signal, use_gpu=True):
    """Analyze a signal with either NumPy's FFT or the custom CUDA kernel.

    Args:
        signal: 1-D array-like of real samples.
        use_gpu: When False, return ``np.fft.fft(signal)``. When True, run
            ``compute_fft_elements`` on the GPU.

    Returns:
        CPU path: the complex spectrum from ``np.fft.fft``.
        GPU path: a float32 array of spectral magnitudes with the same shape
        as the input. Note the two paths do not return the same kind of
        data; take ``np.abs`` of the CPU result before comparing them.
    """
    if not use_gpu:
        return np.fft.fft(signal)

    samples = np.asarray(signal)

    # An empty input would yield a grid of zero blocks, which is not a valid
    # CUDA launch configuration, so answer directly without touching the GPU.
    if samples.size == 0:
        return np.zeros(samples.shape, dtype=np.float32)

    # GPU memory management: upload the input as float32 and allocate the
    # output on the device. The kernel writes every element, so there is no
    # need to zero-fill a host array and transfer it.
    signal_gpu = cuda.to_device(samples.astype(np.float32))
    output_gpu = cuda.device_array(samples.shape, dtype=np.float32)

    # Configure kernel execution: round the block count up to cover all samples
    threadsperblock = 256
    blockspergrid = (samples.size + threadsperblock - 1) // threadsperblock

    # Launch kernel
    compute_fft_elements[blockspergrid, threadsperblock](signal_gpu, output_gpu)

    return output_gpu.copy_to_host()

## Benchmarking and Visualization

In [ ]:
def run_benchmark():
    """Run a CPU-vs-GPU timing comparison across several signal sizes.

    For each size a fresh signal is generated and ``analyze_signal`` is timed
    once on the CPU path and once on the GPU path. The GPU time covers the
    host/device transfers performed inside ``analyze_signal``.

    Returns:
        A tuple ``(results, t, signal, cpu_result, gpu_result)`` where
        ``results`` is a list of per-size dicts (keys ``size``, ``cpu_time``,
        ``gpu_time``, ``speedup``, ``signal_size``) and the remaining items
        are the time axis, signal and both outputs for the LAST size only.
    """
    # Local import: a monotonic high-resolution clock suited to benchmarking
    from time import perf_counter

    sizes = [1024, 4096, 16384, 65536]
    results = []

    # Untimed warm-up launch: Numba compiles the kernel on its first call,
    # which would otherwise be charged to the first timed GPU run. A zeros
    # array is used so the global random state is left untouched.
    analyze_signal(np.zeros(sizes[0]), use_gpu=True)

    for size in sizes:
        t, signal = generate_signal(size)

        # CPU timing
        start = perf_counter()
        cpu_result = analyze_signal(signal, use_gpu=False)
        cpu_time = perf_counter() - start

        # GPU timing
        start = perf_counter()
        gpu_result = analyze_signal(signal, use_gpu=True)
        gpu_time = perf_counter() - start

        # Guard the ratio in case the clock reports a zero-length interval
        speedup = cpu_time / gpu_time if gpu_time > 0 else float('inf')

        results.append({
            'size': size,
            'cpu_time': cpu_time,
            'gpu_time': gpu_time,
            'speedup': speedup,
            'signal_size': size
        })

        print(f"Size {size}: CPU {cpu_time:.4f}s, GPU {gpu_time:.4f}s, Speedup {speedup:.2f}x")

    return results, t, signal, cpu_result, gpu_result

def plot_results(results, t, signal, cpu_result, gpu_result):
    """Draw a 2x2 figure summarizing the signal analysis and the benchmark.

    Panels: the first 100 signal samples, the CPU and GPU magnitude spectra
    (first half of the bins), speedup versus signal size, and a grouped bar
    chart of execution times.

    Args:
        results: List of dicts with keys ``signal_size``, ``cpu_time``,
            ``gpu_time`` and ``speedup``.
        t: Time axis of ``signal``; its spacing sets the frequency axis.
        signal: The time-domain samples.
        cpu_result: Spectrum from the CPU path (magnitude is taken here).
        gpu_result: Output of the GPU path.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    sns.set_style("whitegrid")
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Original signal plot
    axes[0,0].plot(t[:100], signal[:100], 'b-', label='Signal')
    axes[0,0].set_title('Original Signal (first 100 samples)')
    axes[0,0].set_xlabel('Time')
    axes[0,0].set_ylabel('Amplitude')
    # The line carries a label, so actually draw the legend for it
    axes[0,0].legend()

    # Frequency domain comparison. Pass the real sample spacing so the axis
    # is in cycles per unit of `t` rather than cycles per sample; fall back
    # to unit spacing when there are too few samples to measure it.
    spacing = t[1] - t[0] if len(t) > 1 else 1.0
    freqs = np.fft.fftfreq(len(signal), d=spacing)
    axes[0,1].plot(freqs[:len(freqs)//2], np.abs(cpu_result)[:len(cpu_result)//2],
                   'b-', alpha=0.7, label='CPU FFT')
    axes[0,1].plot(freqs[:len(freqs)//2], np.abs(gpu_result)[:len(gpu_result)//2],
                   'r--', alpha=0.7, label='GPU FFT')
    axes[0,1].set_title('Frequency Domain Comparison')
    axes[0,1].set_xlabel('Frequency')
    axes[0,1].set_ylabel('Magnitude')
    axes[0,1].legend()

    # Performance scaling
    sizes = [r['signal_size'] for r in results]
    speedups = [r['speedup'] for r in results]
    axes[1,0].plot(sizes, speedups, 'go-')
    axes[1,0].set_xscale('log')
    axes[1,0].set_xlabel('Signal Size')
    axes[1,0].set_ylabel('Speedup (CPU/GPU)')
    axes[1,0].set_title('GPU Speedup vs Signal Size')

    # Time comparison: reshape to long form so seaborn can group by platform
    times = pd.DataFrame({
        'Size': [str(r['signal_size']) for r in results],
        'CPU': [r['cpu_time'] for r in results],
        'GPU': [r['gpu_time'] for r in results]
    }).melt(id_vars=['Size'], var_name='Platform', value_name='Time')

    sns.barplot(data=times, x='Size', y='Time', hue='Platform', ax=axes[1,1])
    axes[1,1].set_title('Execution Time Comparison')
    axes[1,1].set_ylabel('Time (seconds)')

    plt.tight_layout()
    plt.show()

In [ ]:
# Execute the benchmark sweep, then plot the outputs it returned
benchmark_outputs = run_benchmark()
results, t, signal, cpu_result, gpu_result = benchmark_outputs
plot_results(results, t, signal, cpu_result, gpu_result)