# Lab 1.6.3: RAPIDS Acceleration - SOLUTIONS

This notebook contains complete solutions to all exercises from Lab 1.6.3.

In [None]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as SklearnRF
from sklearn.metrics import accuracy_score

# Pick the first whitegrid style this matplotlib knows about: the modern
# 'seaborn-v0_8-*' name first, then the legacy name. If neither is
# registered, matplotlib's default style is kept.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    try:
        plt.style.use(_style_name)
    except OSError:
        continue  # Unknown style name here; try the next candidate
    break
del _style_name  # Do not leave the loop variable in the notebook namespace

# Probe for the RAPIDS stack. Later cells branch on RAPIDS_AVAILABLE, so
# the notebook still runs (printing expected results) when it is missing.
try:
    import cudf
    import cupy as cp
    from cuml.ensemble import RandomForestClassifier as CumlRF
    from cuml.linear_model import LogisticRegression as CumlLR
except ImportError:
    # At least one RAPIDS package could not be imported
    RAPIDS_AVAILABLE = False
    print("❌ RAPIDS not available. Use NGC container:")
    print("   nvcr.io/nvidia/rapidsai/base:25.11-py3")
else:
    # Every import succeeded, so the GPU code paths can be used
    RAPIDS_AVAILABLE = True
    print("✅ RAPIDS is available!")

## Exercise 1 Solution: Benchmark Different Dataset Sizes

In [None]:
# Exercise 1: How speedup changes with dataset size
#
# Fits the same Random Forest configuration with scikit-learn (CPU) and
# cuML (GPU) on progressively larger synthetic datasets, records each fit
# time, and plots the raw times alongside the resulting speedup.

if RAPIDS_AVAILABLE:
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time() can jump if the system clock is adjusted.
    from time import perf_counter

    sizes = [10_000, 50_000, 100_000, 500_000, 1_000_000]
    n_features = 30

    sklearn_times = []
    cuml_times = []
    speedups = []

    # Warm-up: the first GPU call typically pays one-time initialization
    # costs. Run a small untimed fit first so that cost is not charged to
    # the smallest dataset, which would understate its speedup.
    X_warm, y_warm = make_classification(
        n_samples=2_000,
        n_features=n_features,
        n_informative=20,
        random_state=0
    )
    CumlRF(n_estimators=5, max_depth=4).fit(
        X_warm.astype(np.float32), y_warm.astype(np.int32)
    )
    del X_warm, y_warm

    print("Benchmarking Random Forest across dataset sizes...")
    print("=" * 60)

    for size in sizes:
        print(f"\nDataset size: {size:,}")

        # Generate data (float32 features / int32 labels, as used for cuML)
        X, y = make_classification(
            n_samples=size,
            n_features=n_features,
            n_informative=20,
            random_state=42
        )
        X = X.astype(np.float32)
        y = y.astype(np.int32)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # scikit-learn: time only the fit call
        sklearn_model = SklearnRF(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
        start = perf_counter()
        sklearn_model.fit(X_train, y_train)
        sklearn_time = perf_counter() - start
        sklearn_times.append(sklearn_time)
        print(f"  sklearn: {sklearn_time:.2f}s")

        # cuML: same hyperparameters, time only the fit call
        cuml_model = CumlRF(n_estimators=50, max_depth=10)
        start = perf_counter()
        cuml_model.fit(X_train, y_train)
        cuml_time = perf_counter() - start
        cuml_times.append(cuml_time)
        print(f"  cuML:    {cuml_time:.2f}s")

        # Speedup > 1 means the GPU fit was faster than the CPU fit
        speedup = sklearn_time / cuml_time
        speedups.append(speedup)
        print(f"  Speedup: {speedup:.1f}x")

    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Training time (log-log, because sizes span two orders of magnitude)
    ax1 = axes[0]
    ax1.plot(sizes, sklearn_times, 'o-', label='sklearn (CPU)', linewidth=2)
    ax1.plot(sizes, cuml_times, 'o-', label='cuML (GPU)', linewidth=2)
    ax1.set_xlabel('Dataset Size')
    ax1.set_ylabel('Training Time (seconds)')
    ax1.set_title('Training Time vs Dataset Size')
    ax1.legend()
    ax1.set_xscale('log')
    ax1.set_yscale('log')

    # Speedup, with a reference line at 1x (CPU and GPU equally fast)
    ax2 = axes[1]
    ax2.plot(sizes, speedups, 'o-', color='green', linewidth=2)
    ax2.axhline(y=1, color='red', linestyle='--', label='Break-even')
    ax2.set_xlabel('Dataset Size')
    ax2.set_ylabel('Speedup (x times faster)')
    ax2.set_title('GPU Speedup vs Dataset Size')
    ax2.set_xscale('log')
    ax2.legend()

    plt.tight_layout()
    plt.show()

    print("\nConclusion:")
    print("  GPU speedup increases with dataset size!")
    print("  Below 10K samples, overhead may dominate.")
    print("  For 1M+ samples, expect 10-50x speedup.")
else:
    print("RAPIDS not available - showing expected results:")
    print("  10K samples: ~2x speedup (GPU overhead)")
    print("  100K samples: ~10x speedup")
    print("  1M samples: ~30-50x speedup")

## Exercise 2 Solution: Port sklearn Pipeline to cuML

In [None]:
# Exercise 2: Porting sklearn pipeline to cuML
#
# Builds the same scale -> PCA -> Random Forest workflow three ways
# (manual cuML chaining, a cuML Pipeline, and an sklearn Pipeline) and
# compares wall time and test accuracy. Every timed region covers the
# same work (fit + predict) so the reported speedup is like-for-like.

if RAPIDS_AVAILABLE:
    from cuml.preprocessing import StandardScaler as CumlScaler
    from cuml.decomposition import PCA as CumlPCA
    from cuml.pipeline import Pipeline as CumlPipeline

    def _to_host(predictions):
        """Return predictions as a NumPy array for sklearn metrics.

        Handles objects exposing ``to_numpy()`` (e.g. cuDF Series), objects
        exposing ``get()`` (e.g. CuPy arrays), and plain array-likes.
        """
        if hasattr(predictions, 'to_numpy'):
            return predictions.to_numpy()
        if hasattr(predictions, 'get'):
            return predictions.get()
        return np.asarray(predictions)

    # Generate test data
    X, y = make_classification(n_samples=100_000, n_features=50, random_state=42)
    X = X.astype(np.float32)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Method 1: Manual chaining (more control)
    print("Method 1: Manual Pipeline")
    print("-" * 40)

    start = time()

    # Scaling: fit on train only, then apply the same transform to test
    scaler = CumlScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA
    pca = CumlPCA(n_components=20)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Classifier
    clf = CumlRF(n_estimators=100, max_depth=16)
    clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)

    manual_time = time() - start
    manual_acc = accuracy_score(y_test, _to_host(y_pred))

    print(f"  Time: {manual_time:.2f}s")
    print(f"  Accuracy: {manual_acc:.4f}")

    # Method 2: cuML Pipeline (cleaner)
    print("\nMethod 2: cuML Pipeline")
    print("-" * 40)

    start = time()

    cuml_pipeline = CumlPipeline([
        ('scaler', CumlScaler()),
        ('pca', CumlPCA(n_components=20)),
        ('classifier', CumlRF(n_estimators=100, max_depth=16))
    ])

    cuml_pipeline.fit(X_train, y_train)
    y_pred_pipe = cuml_pipeline.predict(X_test)

    pipe_time = time() - start
    pipe_acc = accuracy_score(y_test, _to_host(y_pred_pipe))

    print(f"  Time: {pipe_time:.2f}s")
    print(f"  Accuracy: {pipe_acc:.4f}")

    # Compare with sklearn
    print("\nComparison with sklearn Pipeline:")
    print("-" * 40)

    from sklearn.pipeline import Pipeline as SklearnPipeline
    from sklearn.preprocessing import StandardScaler as SklearnScaler
    from sklearn.decomposition import PCA as SklearnPCA

    start = time()

    sklearn_pipeline = SklearnPipeline([
        ('scaler', SklearnScaler()),
        ('pca', SklearnPCA(n_components=20)),
        ('classifier', SklearnRF(n_estimators=100, max_depth=16, n_jobs=-1, random_state=42))
    ])

    sklearn_pipeline.fit(X_train, y_train)
    # Predict inside the timed region: pipe_time above also covers
    # fit + predict, so the speedup below compares equal work.
    y_pred_sklearn = sklearn_pipeline.predict(X_test)

    sklearn_time = time() - start
    sklearn_acc = accuracy_score(y_test, y_pred_sklearn)

    print(f"  sklearn time: {sklearn_time:.2f}s")
    print(f"  cuML time: {pipe_time:.2f}s")
    print(f"  Speedup: {sklearn_time/pipe_time:.1f}x")
    # A speedup is only meaningful if accuracy is comparable
    print(f"  sklearn accuracy: {sklearn_acc:.4f}")
    print(f"  cuML accuracy: {pipe_acc:.4f}")
else:
    print("""Expected pipeline code:
    
    from cuml.pipeline import Pipeline
    from cuml.preprocessing import StandardScaler
    from cuml.decomposition import PCA
    from cuml.ensemble import RandomForestClassifier
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=20)),
        ('classifier', RandomForestClassifier(n_estimators=100))
    ])
    
    pipeline.fit(X_train, y_train)
    """)

## Exercise 3 Solution: Memory Profiling

In [None]:
# Exercise 3: GPU Memory Profiling
#
# Tracks GPU memory while data is generated on the CPU, copied to the GPU
# with cuDF, used to train a cuML Random Forest, and finally released.
# Usage is read from the CUDA runtime (device-wide) rather than from
# CuPy's memory pool: cuDF and cuML allocate through RMM, so their
# allocations do not show up in cp.get_default_memory_pool().

if RAPIDS_AVAILABLE:
    import gc

    def get_gpu_memory_mb():
        """Get current device-wide GPU memory usage in MB.

        Computed as total minus free memory as reported by the CUDA
        runtime, so it covers every allocator in this process. It also
        includes memory held by other processes on the same GPU, which is
        why the cell reports changes relative to a baseline.
        """
        free_bytes, total_bytes = cp.cuda.runtime.memGetInfo()
        return (total_bytes - free_bytes) / 1e6

    def free_gpu_memory():
        """Free unused GPU memory."""
        # Collect unreachable Python objects first so their device buffers
        # are released, then return CuPy's cached blocks to the driver.
        gc.collect()
        cp.get_default_memory_pool().free_all_blocks()
        cp.get_default_pinned_memory_pool().free_all_blocks()

    print("GPU Memory Profiling")
    print("=" * 50)

    # Clean state
    free_gpu_memory()
    baseline_mem = get_gpu_memory_mb()
    print(f"Baseline: {baseline_mem:.1f} MB")

    def used_since_baseline_mb():
        """Return GPU memory usage relative to the baseline reading, in MB."""
        return get_gpu_memory_mb() - baseline_mem

    # Load data (float32 features / int32 labels, as in Exercises 1 and 2)
    X_large, y_large = make_classification(n_samples=1_000_000, n_features=50, random_state=42)
    X_large = X_large.astype(np.float32)
    y_large = y_large.astype(np.int32)
    print(f"\nAfter generating data (NumPy): {used_since_baseline_mb():+.1f} MB vs baseline")
    print(f"  Expected: ~0 MB (NumPy uses CPU RAM)")

    # Convert to cuDF
    X_gpu = cudf.DataFrame(X_large)
    print(f"\nAfter cuDF conversion: {used_since_baseline_mb():+.1f} MB vs baseline")
    print(f"  Expected: ~{X_large.nbytes/1e6:.1f} MB (data copied to GPU)")

    # Train model
    print(f"\nTraining Random Forest...")
    mem_before_train = get_gpu_memory_mb()

    model = CumlRF(n_estimators=50, max_depth=12)
    model.fit(X_gpu, y_large)

    mem_after_train = get_gpu_memory_mb()
    print(f"  Before training: {mem_before_train:.1f} MB")
    print(f"  After training: {mem_after_train:.1f} MB")
    print(f"  Model overhead: {mem_after_train - mem_before_train:.1f} MB")

    # Cleanup
    print(f"\nCleaning up...")
    del X_gpu, model
    free_gpu_memory()
    print(f"  After cleanup: {used_since_baseline_mb():+.1f} MB vs baseline")

    print("\nBest Practices:")
    print("  1. Use float32 (not float64) to halve memory")
    print("  2. Delete unused objects and call free_gpu_memory()")
    print("  3. Process in batches for very large datasets")
    print("  4. DGX Spark's 128GB unified memory helps a lot!")
else:
    print("""Memory profiling code:
    
    import cupy as cp
    
    def get_gpu_memory_mb():
        free_bytes, total_bytes = cp.cuda.runtime.memGetInfo()
        return (total_bytes - free_bytes) / 1e6
    
    def free_gpu_memory():
        import gc
        gc.collect()
        cp.get_default_memory_pool().free_all_blocks()
    
    print(f"Before: {get_gpu_memory_mb():.1f} MB")
    # ... do work ...
    print(f"After: {get_gpu_memory_mb():.1f} MB")
    """)

## Key Takeaways

1. **Speedup increases with dataset size** - GPU overhead becomes negligible for large datasets
2. **cuML pipelines work like sklearn** - easy migration with minimal code changes
3. **Memory management matters** - always clean up and use float32
4. **DGX Spark is ideal** - 128GB unified memory eliminates many GPU memory constraints