# Risk & Distribution Estimation - Cheat Sheet

## Key Concepts
1. **Empirical Distribution Function (EDF)**: Estimate unknown distribution from data
2. **DKW Inequality**: Provides confidence bands for the EDF
3. **Density Estimation**: PMF for discrete data, histograms for continuous data
4. **Risk Minimization**: Finding the best model by minimizing loss (e.g., linear regression)

---
## 1. Empirical Distribution Function (EDF)

**What it is:** A way to estimate the unknown distribution function $F^*$ from data.

**Formula:** $\widehat{F}_n(x) = \frac{1}{n} \sum_{i=1}^n \mathbf{1}_{[X_i,+\infty)}(x)$

**In words:** For each point $x$, count how many data points are $\leq x$, then divide by total number of data points.

In [None]:
# Setup
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# CHANGE THESE PARAMETERS
distribution_type = 'poisson'  # Options: 'poisson', 'normal', 'exponential', 'uniform'
sample_size = 1000
param1 = 1  # For poisson: lambda, normal: mean, exponential: lambda (rate, must be non-zero), uniform: low
param2 = 0  # For normal: std, uniform: high (otherwise unused); 0 means "use the fallback below"

# Generate data based on distribution type
if distribution_type == 'poisson':
    data = np.random.poisson(param1, size=sample_size)
elif distribution_type == 'normal':
    # param2 == 0 falls back to a standard deviation of 1
    data = np.random.normal(param1, param2 if param2 != 0 else 1, size=sample_size)
elif distribution_type == 'exponential':
    # NumPy's exponential takes the scale, i.e. 1 / rate
    data = np.random.exponential(1/param1, size=sample_size)
elif distribution_type == 'uniform':
    # param2 == 0 falls back to an interval of width 1 starting at param1
    data = np.random.uniform(param1, param2 if param2 != 0 else param1+1, size=sample_size)
else:
    # Fail here with a clear message rather than with a NameError on `data` below
    raise ValueError(f"Unknown distribution_type: {distribution_type!r}")

print(f"Generated {sample_size} samples from {distribution_type} distribution")
print(f"First 10 samples: {data[:10]}")

In [None]:
# Function to create EDF
def makeEDF(data):
    """Create the Empirical Distribution Function (EDF) of a sample.

    Args:
        data: Array-like of observations. Multi-dimensional input is
            flattened, so every element counts as one observation.

    Returns:
        A pair ``(unique_vals, edf_vals)``: the sorted distinct values in
        ``data`` and, for each of them, the fraction of observations that
        are less than or equal to it.

    NaN values are not treated specially; filter them out beforehand.
    """
    values = np.asarray(data).ravel()
    # np.unique yields the distinct values in sorted order with their
    # multiplicities, so the running total of the counts is exactly the
    # number of observations <= each value (no per-value rescan needed).
    unique_vals, counts = np.unique(values, return_counts=True)
    edf_vals = np.cumsum(counts) / values.size
    return unique_vals, edf_vals

# Function to plot EDF
def plotEDF(x_vals, edf_vals, title="Empirical Distribution Function"):
    """Draw an EDF as a step curve on a new 10x6 figure and display it.

    x_vals are the jump locations and edf_vals the EDF value at each of
    them; where='post' keeps each value constant until the next jump.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.step(x_vals, edf_vals, where='post', linewidth=2, label='EDF')
    ax.set_xlabel('x')
    ax.set_ylabel('F(x)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    ax.legend()
    plt.show()

# Create and plot the EDF of the sample generated in the setup cell.
# x_vals / edf_vals are left at module level.
x_vals, edf_vals = makeEDF(data)
plotEDF(x_vals, edf_vals, f"EDF for {distribution_type} distribution (n={sample_size})")

---
## 2. DKW Inequality & Confidence Bands

**What it is:** Tells us how confident we can be that our EDF is close to the true distribution.

**DKW Inequality:** $P\left( \sup_x | \widehat{F}_n(x) - F^*(x) | > \varepsilon \right) \leq 2 \exp(-2n\varepsilon^2)$

**Confidence Band:** With probability $\geq 1-\alpha$, the true $F^*$ satisfies $\underline{C}_n(x) \leq F^*(x) \leq \overline{C}_n(x)$ for all $x$ simultaneously, where:
- $\underline{C}_n(x) = \max\{\widehat{F}_n(x) - \varepsilon_n, 0\}$
- $\overline{C}_n(x) = \min\{\widehat{F}_n(x) + \varepsilon_n, 1\}$
- where $\varepsilon_n = \sqrt{\frac{1}{2n}\log(\frac{2}{\alpha})}$

In [None]:
# CHANGE THIS PARAMETER
confidence_level = 0.95  # Common values: 0.90, 0.95, 0.99

# Calculate confidence bands using DKW inequality
# alpha is the allowed probability that the band misses the true CDF
alpha = 1 - confidence_level
n = len(data)
# epsilon_n = sqrt(log(2/alpha) / (2n)): the amount added to and subtracted
# from the EDF below, i.e. the half-width of the band before clipping
epsilon_n = np.sqrt((1/(2*n)) * np.log(2/alpha))

print(f"Confidence level: {confidence_level*100}%")
print(f"Sample size: {n}")
print(f"Epsilon (band width): {epsilon_n:.4f}")

# Compute confidence bands
# The bands are clipped to [0, 1] because a distribution function cannot
# leave that range
x_vals, edf_vals = makeEDF(data)
lower_band = np.maximum(edf_vals - epsilon_n, 0)
upper_band = np.minimum(edf_vals + epsilon_n, 1)

# Plot EDF with confidence bands
# All curves use where='post' (and the fill uses step='post') so the band
# is drawn with the same step shape as the EDF itself
plt.figure(figsize=(12, 6))
plt.step(x_vals, edf_vals, where='post', linewidth=2, label='EDF', color='blue')
plt.step(x_vals, lower_band, where='post', linewidth=1.5, linestyle='--', 
         label=f'{confidence_level*100}% Lower Band', color='red')
plt.step(x_vals, upper_band, where='post', linewidth=1.5, linestyle='--', 
         label=f'{confidence_level*100}% Upper Band', color='green')
plt.fill_between(x_vals, lower_band, upper_band, alpha=0.2, step='post', color='gray')
plt.xlabel('x')
plt.ylabel('F(x)')
plt.title(f'EDF with {confidence_level*100}% Confidence Bands (DKW Inequality)')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

---
## 3. Estimating Density

### 3.1 For Discrete Data: Empirical PMF (Probability Mass Function)

**What it is:** The proportion of times each value appears in the data.

In [None]:
# Generate discrete data (e.g., Poisson)
# CHANGE THESE PARAMETERS
discrete_distribution = 'poisson'  # Options: 'poisson', 'binomial', 'geometric'
discrete_sample_size = 1000
discrete_param1 = 3  # For poisson: lambda, binomial: n, geometric: p (must lie in (0, 1])
discrete_param2 = 0.5  # For binomial: p (otherwise unused)

# Generate discrete data
if discrete_distribution == 'poisson':
    discrete_data = np.random.poisson(discrete_param1, size=discrete_sample_size)
elif discrete_distribution == 'binomial':
    # The number of trials must be an integer, hence the int() conversion
    discrete_data = np.random.binomial(int(discrete_param1), discrete_param2, size=discrete_sample_size)
elif discrete_distribution == 'geometric':
    # discrete_param1 is the success probability here; the default of 3 is
    # only valid for the other options and must be changed for this one
    discrete_data = np.random.geometric(discrete_param1, size=discrete_sample_size)
else:
    # Fail here with a clear message rather than with a NameError on
    # `discrete_data` further down
    raise ValueError(f"Unknown discrete_distribution: {discrete_distribution!r}")

# Function to create Empirical PMF
def makeEMF(data):
    """Return the distinct values of a discrete sample and their relative frequencies.

    The second element of the returned pair holds, for each distinct value,
    its number of occurrences divided by len(data).
    """
    support, occurrences = np.unique(data, return_counts=True)
    total = len(data)
    return support, occurrences / total

# Create and plot EMF
# vals are the distinct observed values, pmf their relative frequencies
vals, pmf = makeEMF(discrete_data)

# One bar per observed value, bar height = empirical probability
plt.figure(figsize=(12, 6))
plt.bar(vals, pmf, width=0.8, alpha=0.7, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Probability')
plt.title(f'Empirical PMF for {discrete_distribution} distribution (n={discrete_sample_size})')
plt.grid(True, alpha=0.3, axis='y')
plt.show()

# Print summary
# Only the first 10 (smallest) values are listed to keep the output short
print(f"Value\tProbability")
for v, p in zip(vals[:10], pmf[:10]):  # Show first 10
    print(f"{v}\t{p:.4f}")

### 3.2 For Continuous Data: Histogram

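**What it is:** Divide the data range into bins and count how many observations fall in each bin. With `density=True` (used below), the counts are rescaled so that the total area of the bars equals 1, which makes the histogram an estimate of the density.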

In [None]:
# Generate continuous data
# CHANGE THESE PARAMETERS
continuous_distribution = 'normal'  # Options: 'normal', 'exponential', 'uniform', 'beta'
continuous_sample_size = 1000
continuous_param1 = 0  # For normal: mean, exponential: lambda, uniform: low, beta: alpha
continuous_param2 = 1  # For normal: std, uniform: high, beta: beta
num_bins = 30  # Number of histogram bins

# Generate continuous data
if continuous_distribution == 'normal':
    continuous_data = np.random.normal(continuous_param1, continuous_param2, size=continuous_sample_size)
elif continuous_distribution == 'exponential':
    # NumPy's exponential takes the scale, i.e. 1 / rate; a rate of 0
    # (the default value of continuous_param1) falls back to scale 1
    exponential_scale = 1/continuous_param1 if continuous_param1 != 0 else 1
    continuous_data = np.random.exponential(exponential_scale, size=continuous_sample_size)
elif continuous_distribution == 'uniform':
    continuous_data = np.random.uniform(continuous_param1, continuous_param2, size=continuous_sample_size)
elif continuous_distribution == 'beta':
    continuous_data = np.random.beta(continuous_param1, continuous_param2, size=continuous_sample_size)
else:
    # Fail here with a clear message rather than with a NameError on
    # `continuous_data` further down
    raise ValueError(f"Unknown continuous_distribution: {continuous_distribution!r}")

# Create histogram
# density=True normalizes the bars so their total area is 1.
# freq holds the bar heights and bins the num_bins + 1 bin edges;
# both are reused by the table printed below.
freq, bins, _ = plt.hist(continuous_data, bins=num_bins, density=True, 
                         alpha=0.7, edgecolor='black', label='Histogram')

plt.xlabel('Value')
plt.ylabel('Density')
plt.title(f'Histogram for {continuous_distribution} distribution (n={continuous_sample_size}, bins={num_bins})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Print bin information (first 10 bins)
print(f"{'Bin Range':<20}\tDensity")
print("-" * 40)
# Pair each density with its left and right edge; zip stops at the shortest
# input, so slicing freq alone limits the table to at most 10 rows
for density, l_edge, r_edge in zip(freq[:10], bins[:-1], bins[1:]):
    print(f"[{l_edge:.2f}, {r_edge:.2f}]\t{density:.4f}")

---
## 4. Risk Minimization: Linear Regression

**Problem:** Find the best linear function $g(x) = kx + m$ that predicts $Y$ from $X$.

**Method:** Minimize the average squared error (quadratic loss):

$$k^*, m^* = \text{argmin}_{k,m} \frac{1}{n}\sum_{i=1}^n (kX_i + m - Y_i)^2$$

**In words:** Find the slope $k$ and intercept $m$ that minimize the average squared difference between predictions and actual values.

In [None]:
# Generate synthetic regression data
# CHANGE THESE PARAMETERS
true_slope = 2.5
true_intercept = 10
noise_level = 5  # Standard deviation of noise
n_samples = 100
x_min, x_max = 0, 10

# Generate data: Y = true_slope * X + true_intercept + noise
# The seed is set immediately before sampling, and X is drawn before the
# noise; changing that order would change the generated data set
np.random.seed(42)  # For reproducibility
X_reg = np.random.uniform(x_min, x_max, n_samples)
Y_reg = true_slope * X_reg + true_intercept + np.random.normal(0, noise_level, n_samples)

# Plot the data
plt.figure(figsize=(12, 6))
plt.scatter(X_reg, Y_reg, alpha=0.6, s=50, label='Data points')
plt.xlabel('X')
plt.ylabel('Y')
plt.title(f'Generated Data (True: Y = {true_slope}X + {true_intercept} + noise)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Echo the generating parameters so the fits below can be compared to them
print(f"Generated {n_samples} data points")
print(f"True relationship: Y = {true_slope}X + {true_intercept} + noise(std={noise_level})")
print(f"X range: [{x_min}, {x_max}]")

In [None]:
# METHOD 1: Using scipy optimization
from scipy import optimize

# Define the risk (mean squared error)
def compute_risk(params, X, Y):
    """Return the mean squared error of the line params[0] * X + params[1] against Y.

    params is unpacked as (slope, intercept), which is the form
    scipy.optimize.minimize passes its current iterate in.
    """
    slope, offset = params
    errors = (slope * X + offset) - Y
    return np.mean(errors ** 2)

# Initial guess
# Starting point [slope, intercept] for the search
initial_params = [0, 0]

# Minimize the risk
# X_reg and Y_reg are forwarded to compute_risk through `args`;
# Nelder-Mead only needs function values, no gradient
result = optimize.minimize(compute_risk, initial_params, args=(X_reg, Y_reg), method='Nelder-Mead')

# result.x is the parameter vector found, result.fun the risk at that point
k_opt, m_opt = result.x
print("=== Optimization Results (scipy) ===")
print(f"Optimal slope (k): {k_opt:.4f}")
print(f"Optimal intercept (m): {m_opt:.4f}")
print(f"Final risk (MSE): {result.fun:.4f}")
print(f"True values: slope={true_slope}, intercept={true_intercept}")

In [None]:
# METHOD 2: Using closed-form solution (Normal Equations)
# For linear regression the minimizer theta = (X^T X)^-1 X^T Y is the
# solution of the linear system (X^T X) theta = X^T Y

# Add column of ones for intercept
X_with_intercept = np.column_stack([X_reg, np.ones(len(X_reg))])

# Closed-form solution: solve the normal equations as a linear system.
# This gives the same theta as multiplying by the explicit inverse, but is
# cheaper and numerically more accurate than forming (X^T X)^-1.
theta = np.linalg.solve(X_with_intercept.T @ X_with_intercept, X_with_intercept.T @ Y_reg)
k_closed, m_closed = theta

print("\n=== Closed-Form Solution ===")
print(f"Optimal slope (k): {k_closed:.4f}")
print(f"Optimal intercept (m): {m_closed:.4f}")

# Compute MSE for closed-form solution
predictions_closed = k_closed * X_reg + m_closed
mse_closed = np.mean((predictions_closed - Y_reg) ** 2)
print(f"Final risk (MSE): {mse_closed:.4f}")

In [None]:
# Plot the fitted line
# Evaluate the scipy fit, the closed-form fit and the true line on a
# common grid of 100 points spanning [x_min, x_max]
x_plot = np.linspace(x_min, x_max, 100)
y_pred_scipy = k_opt * x_plot + m_opt
y_pred_closed = k_closed * x_plot + m_closed
y_true = true_slope * x_plot + true_intercept

# Distinct line styles (solid / dashed / dotted) keep the three lines
# distinguishable even where they overlap
plt.figure(figsize=(12, 7))
plt.scatter(X_reg, Y_reg, alpha=0.6, s=50, label='Data points', color='blue')
plt.plot(x_plot, y_pred_scipy, 'r-', linewidth=2, 
         label=f'Scipy: Y = {k_opt:.2f}X + {m_opt:.2f}')
plt.plot(x_plot, y_pred_closed, 'g--', linewidth=2, 
         label=f'Closed-form: Y = {k_closed:.2f}X + {m_closed:.2f}')
plt.plot(x_plot, y_true, 'k:', linewidth=2, alpha=0.5,
         label=f'True: Y = {true_slope}X + {true_intercept}')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Linear Regression: Risk Minimization')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

---
## 5. Real Data Example: House Prices

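Let's apply linear regression to house price data. The cell below generates synthetic prices for demonstration; replace it with your own CSV to work with real data.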

In [None]:
# For demonstration, let's create synthetic house price data
# In practice, you would load from CSV: pd.read_csv('portland.csv')

# CHANGE THESE TO USE YOUR OWN DATA
# If you have a CSV file, uncomment and modify:
# import pandas as pd
# df = pd.read_csv('your_data.csv')
# X_houses = df['size'].values  # or df.iloc[:, 0]
# Y_prices = df['price'].values  # or df.iloc[:, 2]

# For now, generate synthetic house data
# The seed is set right before sampling so the same 50 houses are produced
# on every run
np.random.seed(123)
n_houses = 50
house_sizes = np.random.uniform(1000, 4000, n_houses)  # Square feet
# Price roughly $100 per sq ft + base of $50k + noise
# (the noise is normal with a standard deviation of 20000)
house_prices = 100 * house_sizes + 50000 + np.random.normal(0, 20000, n_houses)

# X_houses / Y_prices are the names the following cells read, whether the
# data is synthetic or loaded from a CSV as sketched above
X_houses = house_sizes
Y_prices = house_prices

# Visualize the data
plt.figure(figsize=(12, 6))
plt.scatter(X_houses, Y_prices, alpha=0.6, s=80, edgecolors='black')
plt.xlabel('House Size (sq ft)')
plt.ylabel('Price ($)')
plt.title('House Prices vs Size')
plt.grid(True, alpha=0.3)
plt.show()

# Quick sanity summary of the data set
print(f"Number of houses: {len(X_houses)}")
print(f"Size range: {X_houses.min():.0f} - {X_houses.max():.0f} sq ft")
print(f"Price range: ${Y_prices.min():.0f} - ${Y_prices.max():.0f}")

In [None]:
# Fit linear regression model
def fit_linear_regression(X, Y):
    """Fit the least-squares line Y = slope * X + intercept (closed form).

    Args:
        X: 1-D array-like of predictor values.
        Y: 1-D array-like of responses, same length as X.

    Returns:
        dict with keys:
            'slope', 'intercept': fitted coefficients,
            'mse': mean squared residual,
            'rmse': square root of 'mse',
            'r_squared': 1 - SS_residual / SS_total, or NaN when Y is
                constant (SS_total == 0, so the ratio is undefined),
            'predictions': fitted values at the given X.

    Raises:
        numpy.linalg.LinAlgError: if X^T X is singular, e.g. when all X
            values are identical.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    # Add intercept term
    X_design = np.column_stack([X, np.ones(len(X))])
    # Solve the normal equations (X^T X) theta = X^T Y as a linear system;
    # this avoids forming the explicit inverse, which is slower and loses
    # accuracy when X^T X is poorly conditioned (e.g. sizes in the 1000s).
    theta = np.linalg.solve(X_design.T @ X_design, X_design.T @ Y)
    slope, intercept = theta

    # Calculate statistics
    predictions = slope * X + intercept
    residuals = Y - predictions
    mse = np.mean(residuals ** 2)
    rmse = np.sqrt(mse)

    # R-squared
    ss_total = np.sum((Y - np.mean(Y)) ** 2)
    ss_residual = np.sum(residuals ** 2)
    # Guard the division: with a constant Y there is no variance to explain
    r_squared = 1 - (ss_residual / ss_total) if ss_total > 0 else np.nan

    return {
        'slope': slope,
        'intercept': intercept,
        'mse': mse,
        'rmse': rmse,
        'r_squared': r_squared,
        'predictions': predictions,
    }

# Fit the model
# results is a dict with the fitted coefficients, error metrics and fitted
# values; it is reused by the plotting cell that follows
results = fit_linear_regression(X_houses, Y_prices)

print("=== Linear Regression Results ===")
print(f"Equation: Price = {results['slope']:.2f} × Size + {results['intercept']:.2f}")
print(f"\nInterpretation:")
print(f"  - Price increases by ${results['slope']:.2f} per square foot")
print(f"  - Base price (at 0 sq ft): ${results['intercept']:.2f}")
print(f"\nModel Quality:")
# NOTE: RMSE is the square root of the mean squared residual, which is not
# the same quantity as the mean absolute error
print(f"  - RMSE: ${results['rmse']:.2f} (average prediction error)")
print(f"  - R²: {results['r_squared']:.4f} (proportion of variance explained)")

# Make a prediction
# CHANGE THIS to predict for different house sizes
new_house_size = 2500
# Evaluate the fitted line at the new size
predicted_price = results['slope'] * new_house_size + results['intercept']
print(f"\n=== Prediction ===")
print(f"A {new_house_size} sq ft house is predicted to cost: ${predicted_price:,.2f}")

In [None]:
# Visualize the regression line
# Evaluate the fitted line on 100 points spanning the observed size range
x_range = np.linspace(X_houses.min(), X_houses.max(), 100)
y_pred_line = results['slope'] * x_range + results['intercept']

# One figure with two side-by-side panels: the fit and its residuals
plt.figure(figsize=(14, 7))

# Main plot
plt.subplot(1, 2, 1)
plt.scatter(X_houses, Y_prices, alpha=0.6, s=80, edgecolors='black', label='Actual prices')
plt.plot(x_range, y_pred_line, 'r-', linewidth=2, 
         label=f"Y = {results['slope']:.1f}X + {results['intercept']:.0f}")
plt.xlabel('House Size (sq ft)', fontsize=12)
plt.ylabel('Price ($)', fontsize=12)
plt.title(f"Linear Regression (R² = {results['r_squared']:.3f})", fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)

# Residual plot
# Residual = actual price minus fitted price, plotted against the fitted
# price; the dashed line at 0 marks a perfect prediction
plt.subplot(1, 2, 2)
residuals = Y_prices - results['predictions']
plt.scatter(results['predictions'], residuals, alpha=0.6, s=80, edgecolors='black')
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Predicted Price ($)', fontsize=12)
plt.ylabel('Residuals ($)', fontsize=12)
plt.title('Residual Plot', fontsize=14)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---
## Summary of Key Functions

### Quick Reference

```python
# 1. Empirical Distribution Function
x_vals, edf_vals = makeEDF(data)
plotEDF(x_vals, edf_vals)

# 2. DKW Confidence Bands
epsilon_n = np.sqrt((1/(2*n)) * np.log(2/alpha))
lower_band = np.maximum(edf_vals - epsilon_n, 0)
upper_band = np.minimum(edf_vals + epsilon_n, 1)

# 3. Empirical PMF (discrete data)
vals, pmf = makeEMF(discrete_data)

# 4. Histogram (continuous data)
plt.hist(continuous_data, bins=30, density=True)

# 5. Linear Regression
results = fit_linear_regression(X, Y)
predicted_y = results['slope'] * new_x + results['intercept']
```

### When to Use Each Method

- **EDF**: When you want to estimate the cumulative distribution from data
- **DKW Bands**: When you need confidence intervals for your distribution estimate
- **PMF**: For discrete random variables (counts, categories)
- **Histogram**: For continuous random variables (measurements)
- **Linear Regression**: When you want to predict one variable from another with a linear relationship