# Lab A.2: Bias-Variance Decomposition - SOLUTIONS

**Module:** A - Statistical Learning Theory  
**Type:** Solution Notebook

---

This notebook contains complete solutions to all exercises from Lab A.2.

In [None]:
# Setup (same as main notebook)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
import warnings
# Silence every warning category so the lab output stays readable.
# NOTE(review): this is a blanket filter, so it hides all warnings raised later in the notebook.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG and set the default figure size for the plots below.
np.random.seed(42)
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Helper functions from main notebook

def true_function(x):
    """Noise-free target curve: sin(2x) + 0.5 * cos(4x).

    Evaluated element-wise through NumPy, so `x` may be a scalar or an array.
    """
    slow_wave = np.sin(2 * x)
    fast_wave = 0.5 * np.cos(4 * x)
    return slow_wave + fast_wave


def generate_data(n_samples, noise_std=0.3, x_min=0, x_max=4, seed=None):
    """Draw a noisy training set from `true_function`.

    Args:
        n_samples: Number of (x, y) pairs to draw.
        noise_std: Standard deviation of the Gaussian noise added to y.
        x_min, x_max: Bounds of the uniform distribution the inputs are drawn from.
        seed: If not None, NumPy's *global* RNG is reseeded with this value
            before drawing, which also affects any later global-RNG calls.

    Returns:
        Tuple `(X, y)`: `X` is a column array of shape (n_samples, 1) and
        `y` is a 1-D array of length n_samples.
    """
    if seed is not None:
        np.random.seed(seed)

    # Inputs are drawn first, then the noise: the draw order fixes the random stream.
    inputs = np.random.uniform(low=x_min, high=x_max, size=n_samples)
    noise = np.random.normal(loc=0, scale=noise_std, size=n_samples)
    targets = true_function(inputs) + noise
    return inputs.reshape(-1, 1), targets


def fit_polynomial(X, y, degree):
    """Fit an ordinary-least-squares polynomial of the given degree.

    The model is a pipeline of a polynomial feature expansion (without the
    constant column) followed by a linear regression.

    Returns:
        The fitted pipeline.
    """
    expansion = PolynomialFeatures(degree, include_bias=False)
    regressor = LinearRegression()
    pipeline = make_pipeline(expansion, regressor)
    return pipeline.fit(X, y)


def bootstrap_bias_variance(degree, n_samples=100, noise_std=0.3,
                            n_bootstrap=200, n_test_points=50):
    """Estimate bias², variance and total error of a polynomial fit by simulation.

    A fresh training set is generated for each of the `n_bootstrap` runs
    (run `i` uses `seed=i`), a degree-`degree` polynomial is fitted, and its
    predictions are recorded on a fixed grid over [0.5, 3.5].

    Returns:
        Dict with keys 'degree', 'bias_squared', 'variance', 'noise'
        (= noise_std²) and 'total_error' (sum of the three components).
    """
    grid = np.linspace(0.5, 3.5, n_test_points).reshape(-1, 1)
    target = true_function(grid.flatten())

    # One row of predictions per simulated training set.
    preds = np.zeros((n_bootstrap, n_test_points))
    for run in range(n_bootstrap):
        X_run, y_run = generate_data(n_samples, noise_std, seed=run)
        preds[run] = fit_polynomial(X_run, y_run, degree).predict(grid)

    avg_pred = preds.mean(axis=0)
    bias_sq = np.mean((avg_pred - target) ** 2)
    spread = np.mean(preds.var(axis=0))
    irreducible = noise_std ** 2

    return {
        'degree': degree,
        'bias_squared': bias_sq,
        'variance': spread,
        'noise': irreducible,
        'total_error': bias_sq + spread + irreducible,
    }

---

## Exercise 1 Solution: Different Noise Levels

**Task:** How does optimal model complexity change with noise level?

In [None]:
# Solution: Test different noise levels

noise_levels = [0.1, 0.3, 0.6]
degrees_to_test = list(range(1, 11))

results_by_noise = {}

for noise in noise_levels:
    print(f"\nTesting noise_std = {noise}")
    # One bias/variance estimate per candidate degree at this noise level
    results = [
        bootstrap_bias_variance(degree, noise_std=noise, n_bootstrap=100)
        for degree in degrees_to_test
    ]
    results_by_noise[noise] = results

    # Report the degree with the smallest estimated total error
    total_errors = [r['total_error'] for r in results]
    optimal_idx = np.argmin(total_errors)
    optimal_degree = degrees_to_test[optimal_idx]
    print(f"  Optimal degree: {optimal_degree}")
    print(f"  Min total error: {total_errors[optimal_idx]:.4f}")

In [None]:
# Plot the error decomposition for each noise level, one panel per level
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# (result key, line format, legend label) for each curve, in drawing order
curve_specs = [
    ('bias_squared', 'b-o', 'Bias²'),
    ('variance', 'r-o', 'Variance'),
    ('total_error', 'g-o', 'Total'),
]

for ax, noise in zip(axes, noise_levels):
    results = results_by_noise[noise]

    for key, fmt, label in curve_specs:
        ax.plot(degrees_to_test, [r[key] for r in results], fmt, label=label, linewidth=2)
    ax.axhline(y=noise**2, color='gray', linestyle='--', label=f'Noise={noise**2:.2f}')

    # Highlight the degree with the lowest total error
    totals = [r['total_error'] for r in results]
    optimal_idx = np.argmin(totals)
    best_degree = degrees_to_test[optimal_idx]
    ax.scatter([best_degree], [totals[optimal_idx]],
               s=200, color='gold', edgecolors='black', zorder=5, marker='*')

    ax.set_xlabel('Polynomial Degree')
    ax.set_ylabel('Error')
    ax.set_title(f'Noise σ = {noise}\nOptimal degree = {best_degree}', fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_xticks(degrees_to_test)

plt.suptitle('Effect of Noise Level on Optimal Complexity', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

for line in (
    "\nKey Finding:",
    "-" * 50,
    "Higher noise levels favor SIMPLER models!",
    "When noise is high, it's harder to distinguish signal",
    "from noise, so simpler models are less likely to",
    "overfit to the noise.",
):
    print(line)

---

## Exercise 2 Solution: More Training Data

**Task:** Does more data allow more complex models?

In [None]:
# Solution: Test different training set sizes

def bootstrap_bias_variance_custom(degree, n_samples, noise_std=0.3, n_bootstrap=100,
                                   n_test_points=50):
    """Estimate bias², variance and total error for a given training-set size.

    Same procedure as `bootstrap_bias_variance`, but `n_samples` is required
    and run `i` draws its training set with `seed=i*1000` instead of `seed=i`.

    Args:
        degree: Polynomial degree of the fitted model.
        n_samples: Number of training points per simulated dataset.
        noise_std: Standard deviation of the label noise.
        n_bootstrap: Number of independently generated training sets.
        n_test_points: Number of evaluation points on the fixed grid
            over [0.5, 3.5].

    Returns:
        Dict with keys 'degree', 'bias_squared', 'variance', 'noise'
        (= noise_std²) and 'total_error' (sum of the three components),
        matching the schema returned by `bootstrap_bias_variance`.
    """
    X_test = np.linspace(0.5, 3.5, n_test_points).reshape(-1, 1)
    y_true_test = true_function(X_test.flatten())
    # Shape is derived from n_test_points so it always matches the grid.
    all_predictions = np.zeros((n_bootstrap, n_test_points))

    for i in range(n_bootstrap):
        X_train, y_train = generate_data(n_samples, noise_std, seed=i*1000)
        model = fit_polynomial(X_train, y_train, degree)
        all_predictions[i] = model.predict(X_test)

    mean_prediction = np.mean(all_predictions, axis=0)
    bias_squared = np.mean((mean_prediction - y_true_test) ** 2)
    variance = np.mean(np.var(all_predictions, axis=0))
    noise = noise_std ** 2

    return {
        'degree': degree,
        'bias_squared': bias_squared,
        'variance': variance,
        'noise': noise,
        'total_error': bias_squared + variance + noise
    }


sample_sizes = [50, 200, 500]
degrees_to_test = [*range(1, 11), 12, 15]

results_by_size = {}

for n_samples in sample_sizes:
    print(f"\nTesting n_samples = {n_samples}")
    # One bias/variance estimate per candidate degree at this training-set size
    results = [
        bootstrap_bias_variance_custom(degree, n_samples, n_bootstrap=50)
        for degree in degrees_to_test
    ]
    results_by_size[n_samples] = results

    # Report the degree with the smallest estimated total error
    total_errors = [r['total_error'] for r in results]
    optimal_idx = np.argmin(total_errors)
    optimal_degree = degrees_to_test[optimal_idx]
    print(f"  Optimal degree: {optimal_degree}")

In [None]:
# Plot the error decomposition for each training-set size, one panel per size
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# (result key, line format, legend label) for each curve, in drawing order
curve_specs = [
    ('bias_squared', 'b-o', 'Bias²'),
    ('variance', 'r-o', 'Variance'),
    ('total_error', 'g-o', 'Total'),
]

for ax, n_samples in zip(axes, sample_sizes):
    results = results_by_size[n_samples]

    for key, fmt, label in curve_specs:
        ax.plot(degrees_to_test, [r[key] for r in results], fmt, label=label, linewidth=2)

    # Highlight the degree with the lowest total error
    totals = [r['total_error'] for r in results]
    optimal_idx = np.argmin(totals)
    best_degree = degrees_to_test[optimal_idx]
    ax.scatter([best_degree], [totals[optimal_idx]],
               s=200, color='gold', edgecolors='black', zorder=5, marker='*')

    ax.set_xlabel('Polynomial Degree')
    ax.set_ylabel('Error')
    ax.set_title(f'n = {n_samples} samples\nOptimal degree = {best_degree}', fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.suptitle('Effect of Dataset Size on Optimal Complexity', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

for line in (
    "\nKey Finding:",
    "-" * 50,
    "More data allows MORE COMPLEX models!",
    "With more data:",
    "  - Variance decreases (more stable estimates)",
    "  - Optimal complexity shifts to the RIGHT",
    "  - This is why big data enables deep learning!",
):
    print(line)

---

## Challenge Solution: Bagging for Variance Reduction

**Task:** Show empirically that averaging predictions (bagging) reduces variance but leaves bias essentially unchanged.

In [None]:
# Solution: Implement and test bagging

def bagging_predictor(X_train, y_train, X_test, n_models=10, degree=10):
    """
    Train multiple models on bootstrap samples and average predictions.

    Each model is a degree-`degree` polynomial fitted on a resample (with
    replacement, same size as the training set) of `(X_train, y_train)`.
    Resampling uses NumPy's global RNG, so results depend on its current state.

    Args:
        X_train: Training inputs, indexable along the first axis (array-like).
        y_train: Training targets with the same length as `X_train`.
        X_test: Inputs at which the ensemble prediction is evaluated.
        n_models: Number of bootstrap models to average (must be >= 1).
        degree: Polynomial degree of every base model.

    Returns:
        Array with the element-wise mean of the individual model predictions.

    Raises:
        ValueError: If `n_models` < 1 or `X_train` and `y_train` differ in length.
    """
    # Without this guard an empty ensemble would silently average to NaN.
    if n_models < 1:
        raise ValueError(f"n_models must be at least 1, got {n_models}")

    # Convert once so index-array selection works for lists and other array-likes.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    n_samples = len(X_train)
    if len(y_train) != n_samples:
        raise ValueError(
            f"X_train and y_train must have the same length, "
            f"got {n_samples} and {len(y_train)}"
        )

    all_predictions = []
    for _ in range(n_models):
        # Bootstrap sample (sample with replacement)
        indices = np.random.choice(n_samples, size=n_samples, replace=True)

        # Fit model on bootstrap sample
        model = fit_polynomial(X_train[indices], y_train[indices], degree)
        all_predictions.append(model.predict(X_test))

    # Average predictions
    return np.mean(all_predictions, axis=0)


def compute_bagging_bias_variance(degree, n_models, n_bootstrap=100,
                                  n_samples=100, noise_std=0.3, n_test_points=50):
    """
    Compute bias and variance for bagged model.

    For each of `n_bootstrap` runs a fresh training set is generated
    (run `i` uses `seed=i*1000`), a bagged ensemble is fitted on it, and its
    prediction is recorded on a fixed grid over [0.5, 3.5].

    `generate_data` reseeds NumPy's global RNG on every run and
    `bagging_predictor` draws its resamples from that same global RNG, so the
    result is reproducible for fixed arguments.

    Args:
        degree: Polynomial degree of every base model.
        n_models: Number of bootstrap models per ensemble.
        n_bootstrap: Number of independently generated training sets.
        n_samples: Number of training points per generated dataset.
        noise_std: Standard deviation of the label noise.
        n_test_points: Number of evaluation points on the grid.

    Returns:
        Tuple `(bias_squared, variance)`, both averaged over the test grid.
    """
    X_test = np.linspace(0.5, 3.5, n_test_points).reshape(-1, 1)
    y_true_test = true_function(X_test.flatten())
    # Shape is derived from n_test_points so it always matches the grid.
    all_predictions = np.zeros((n_bootstrap, n_test_points))

    for i in range(n_bootstrap):
        # Generate new training set
        X_train, y_train = generate_data(n_samples, noise_std=noise_std, seed=i*1000)

        # Get bagged prediction
        all_predictions[i] = bagging_predictor(
            X_train, y_train, X_test, n_models=n_models, degree=degree
        )

    mean_prediction = np.mean(all_predictions, axis=0)
    bias_squared = np.mean((mean_prediction - y_true_test) ** 2)
    variance = np.mean(np.var(all_predictions, axis=0))

    return bias_squared, variance


print("Comparing single model vs bagged model (degree=10):")
print("=" * 55)

# Use the same number of simulated training sets for the baseline and the ensembles.
n_replicates = 50

# Single model.
# bootstrap_bias_variance_custom draws run i with seed=i*1000, exactly like
# compute_bagging_bias_variance, so with equal n_bootstrap both estimators are
# evaluated on identical training sets. The comparison is then paired, and the
# reported reduction is not confounded by different random datasets.
single_result = bootstrap_bias_variance_custom(
    degree=10, n_samples=100, noise_std=0.3, n_bootstrap=n_replicates
)
print(f"\nSingle Model:")
print(f"  Bias²: {single_result['bias_squared']:.4f}")
print(f"  Variance: {single_result['variance']:.4f}")

# Bagged models with different ensemble sizes
for n_models in [5, 10, 20]:
    bias, var = compute_bagging_bias_variance(degree=10, n_models=n_models, n_bootstrap=n_replicates)
    print(f"\nBagged ({n_models} models):")
    print(f"  Bias²: {bias:.4f}")
    print(f"  Variance: {var:.4f}")
    print(f"  Variance reduction: {(1 - var/single_result['variance'])*100:.1f}%")

In [None]:
# Visualize the variance reduction
ensemble_sizes = [1, 2, 5, 10, 20, 50]
# Same number of simulated training sets for every ensemble size, baseline included.
n_replicates = 30
biases = []
variances = []

print("Computing for various ensemble sizes...")
for n in ensemble_sizes:
    if n == 1:
        # Single model fitted on each full training set (no resampling).
        # bootstrap_bias_variance_custom uses seed=i*1000 like
        # compute_bagging_bias_variance, so every point on the curve is
        # estimated from the same training sets.
        result = bootstrap_bias_variance_custom(
            degree=10, n_samples=100, noise_std=0.3, n_bootstrap=n_replicates
        )
        biases.append(result['bias_squared'])
        variances.append(result['variance'])
    else:
        bias, var = compute_bagging_bias_variance(degree=10, n_models=n, n_bootstrap=n_replicates)
        biases.append(bias)
        variances.append(var)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(ensemble_sizes, biases, 'b-o', linewidth=2, markersize=10, label='Bias²')
ax.plot(ensemble_sizes, variances, 'r-o', linewidth=2, markersize=10, label='Variance')

# Idealized reference: the variance an average of n *independent* models would have.
# Bootstrap models are trained on resamples of one dataset and are correlated,
# so the measured curve is expected to sit above this line and level off.
reference_var = [variances[0] / n for n in ensemble_sizes]
ax.plot(ensemble_sizes, reference_var, 'r--', linewidth=1, alpha=0.5,
        label='1/n reference (independent models)')

ax.set_xlabel('Number of Models in Ensemble', fontsize=12)
ax.set_ylabel('Error Component', fontsize=12)
ax.set_title('Bagging: Variance Reduction with Ensemble Size\n(Bias stays constant!)', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_xscale('log')

plt.tight_layout()
plt.show()

print("\nKey Finding:")
print("-" * 50)
print("Bagging REDUCES VARIANCE but leaves BIAS unchanged!")
print("This is because:")
print("  - Averaging many bootstrap fits smooths out their individual fluctuations")
print("  - But the systematic error (bias) is the same for all models")
print("  - Independent models would give a 1/n decay (n = ensemble size); bootstrap")
print("    models are correlated, so the reduction is slower and levels off")

### Solution Notes

1. **Exercise 1 (Noise Levels)**:
   - Low noise (σ=0.1): Can use more complex models (optimal ~degree 5-6)
   - High noise (σ=0.6): Need simpler models (optimal ~degree 2-3)
   - High noise makes it harder to distinguish signal from noise

2. **Exercise 2 (Dataset Size)**:
   - More data → Lower variance → Can use more complex models
   - n=50: Optimal degree ~4
   - n=500: Optimal degree ~6-8
   - This is why big data enables deep learning!

3. **Challenge (Bagging)**:
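   - Bagging reduces variance by averaging the predictions of models trained on bootstrap resamples of the training set
   - Bias stays roughly constant because all models have the same systematic error
   - For independent models the variance would decrease as 1/n, where n = number of models; bootstrap models share one training set and are correlated, so the reduction is slower and levels off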
   - This is the theoretical foundation for Random Forests!

---

## Key Takeaways

1. **Noise level** affects optimal complexity: higher noise → simpler models
2. **More data** reduces variance: larger datasets → can use more complex models
3. **Bagging** is a variance reduction technique: average models to reduce instability
4. **The fundamental tradeoff** remains: you can't eliminate both bias and variance