# Word Length Analysis with CUDA and Numba

This notebook demonstrates GPU acceleration using Numba for a simple text analysis task.

Key concepts covered:
- Basic CUDA kernel implementation
- Memory transfer between CPU and GPU
- Atomic operations
- Grid/block configuration

In [ ]:
!pip install numba

In [ ]:
import numpy as np
from numba import cuda, vectorize
import math
from time import time

## CPU Implementation
A basic sequential implementation that serves as the baseline for comparison. Words of 20 or more characters are not counted.

In [ ]:
def count_word_lengths_cpu(words, max_length=20):
    """Build a histogram of word lengths on the CPU.

    Args:
        words: Iterable of strings (any objects supporting ``len``).
        max_length: Number of histogram bins. Words whose length is
            ``max_length`` or more are ignored. Defaults to 20.

    Returns:
        An ``int32`` NumPy array with ``max_length`` entries, where entry
        ``i`` is the number of words of length ``i``.
    """
    lengths = np.fromiter((len(word) for word in words), dtype=np.int64)
    # Drop over-long words first so bincount never grows past max_length bins.
    in_range = lengths[lengths < max_length]
    # bincount yields a platform integer; cast to keep the int32 result type.
    return np.bincount(in_range, minlength=max_length).astype(np.int32)

## GPU Implementations
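Two GPU building blocks: a vectorized ufunc (`get_word_length`, which only clamps a length to at most 20 and is not used by the analysis below) and a hand-written CUDA kernel that builds the length histogram with atomic adds.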

In [ ]:
@vectorize(['int32(int32)'], target='cuda')
def get_word_length(length):
    """Return ``length`` capped at 20 (applied element-wise as a ufunc)."""
    return length if length < 20 else 20

@cuda.jit
def count_word_lengths_gpu(word_lengths, counts):
    """CUDA kernel: accumulate a histogram of word lengths into ``counts``.

    Each thread handles at most one element of ``word_lengths`` and
    increments the matching bin of ``counts``. Lengths that do not map to a
    bin (negative, or ``>= counts.size``) are skipped.

    Args:
        word_lengths: Device array of integer word lengths, indexed by the
            1-D global thread index.
        counts: Device array of integer bins, updated in place. The kernel
            only adds to it, so the caller should pass it zero-filled to
            obtain plain counts.
    """
    idx = cuda.grid(1)
    # Threads whose global index is past the end of the input have no work.
    if idx >= word_lengths.size:
        return

    length = word_lengths[idx]
    # Bound by the real number of bins instead of a hard-coded 20 so the
    # kernel can never write outside ``counts``, whatever its size.
    if length >= 0 and length < counts.size:
        # Several threads may hit the same bin at once, so the increment
        # has to be atomic.
        cuda.atomic.add(counts, length, 1)

## Main Analysis Function

In [ ]:
def analyze_text(text, use_gpu=True):
    """Count how many whitespace-separated words of each length ``text`` has.

    Args:
        text: Input string; it is split on whitespace to obtain the words.
        use_gpu: When true (the default) the histogram is built by the CUDA
            kernel; otherwise by the sequential CPU implementation.

    Returns:
        An ``int32`` NumPy array of 20 bins where entry ``i`` is the number
        of words of length ``i``. Words of 20 or more characters are not
        counted.
    """
    num_bins = 20
    words = text.split()

    if not words:
        # Nothing to count. Returning here also avoids launching the kernel
        # with a zero-block grid, which is not a valid CUDA configuration.
        return np.zeros(num_bins, dtype=np.int32)

    if not use_gpu:
        return count_word_lengths_cpu(words)

    # Only the GPU path needs the lengths as a contiguous int32 array.
    word_lengths = np.array([len(word) for word in words], dtype=np.int32)
    d_lengths = cuda.to_device(word_lengths)
    d_counts = cuda.to_device(np.zeros(num_bins, dtype=np.int32))

    threadsperblock = 256
    # Ceiling division: enough blocks to give every word its own thread.
    blockspergrid = (word_lengths.size + threadsperblock - 1) // threadsperblock

    count_word_lengths_gpu[blockspergrid, threadsperblock](d_lengths, d_counts)

    return d_counts.copy_to_host()

## Test and Benchmark

In [ ]:
# perf_counter is a monotonic, high-resolution clock intended for timing
# short intervals.
from time import perf_counter

# Create test data
words = ["python", "cuda", "gpu", "computing"] * 250000
text = " ".join(words)

# Warm-up: the first GPU call pays one-time costs (kernel JIT compilation,
# CUDA context setup). Run it once on a tiny input so the timed run below
# measures only the transfers and the kernel itself.
analyze_text("warm up", use_gpu=True)

# CPU benchmark
t0 = perf_counter()
cpu_counts = analyze_text(text, use_gpu=False)
cpu_time = perf_counter() - t0

# GPU benchmark
t0 = perf_counter()
gpu_counts = analyze_text(text, use_gpu=True)
gpu_time = perf_counter() - t0

print(f"CPU time: {cpu_time:.4f}s")
print(f"GPU time: {gpu_time:.4f}s")
print(f"Speedup: {cpu_time/gpu_time:.2f}x")

# Compare results: both implementations must produce the same histogram.
if np.array_equal(cpu_counts, gpu_counts):
    print("CPU and GPU results match")
else:
    print("WARNING: CPU and GPU results differ")

print("\nWord length frequencies:")
for length, count in enumerate(gpu_counts):
    if count > 0:
        print(f"Length {length}: {count}")