# 03. Calibration Demo

Notebook demo tính năng calibration: hiệu chỉnh forecast sao cho khớp với actual DEL curves.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

In [None]:
from config import (CFG, SEGMENT_COLS, BUCKETS_CANON, BUCKETS_30P, BUCKETS_60P, BUCKETS_90P,
                    ABSORBING_BASE, MAX_MOB, DENOM_LEVEL, K_CLIP)
from data_io import load_parquet, validate_schema
from transitions import prepare_transitions, estimate_transition_matrices
from forecast import build_initial_vectors, forecast
from calibration import fit_del_curve_factors, apply_matrix_calibration
from metrics import compute_del_from_snapshot, compute_del_from_forecast, make_mixed_report

# Echo the K_CLIP value imported from config so the notebook output records
# which (k_min, k_max) bounds were in effect for this run.
print(f'K_CLIP bounds: {K_CLIP}')

## Step 1: Load Data and Run Basic Pipeline

In [None]:
# --- 1. Read the snapshot and derive the transition-level dataset ---------
df = load_parquet('../Oct25.parquet')
df_trans = prepare_transitions(df, CFG, SEGMENT_COLS, BUCKETS_CANON, ABSORBING_BASE)

# --- 2. Estimate transition matrices at three segmentation levels ---------
# GLOBAL uses no segment columns, COARSE only the first one (or none when
# SEGMENT_COLS is empty), FULL uses all of them.
segment_levels = [
    ("GLOBAL", []),
    ("COARSE", [SEGMENT_COLS[0]] if SEGMENT_COLS else []),
    ("FULL", SEGMENT_COLS),
]
prior_strengths = {
    "coarse": 100.0,
    "full": 50.0,
}

# NOTE(review): "ead", 30 and the trailing None are passed positionally; their
# meaning is defined by estimate_transition_matrices - confirm against its signature.
transitions_dict, transitions_long_df, meta_df = estimate_transition_matrices(
    df_trans,
    CFG,
    BUCKETS_CANON,
    segment_levels,
    MAX_MOB,
    "ead",
    30,
    prior_strengths,
    None,
)

# --- 3. Initial state vectors, then the baseline (uncalibrated) forecast ---
df_init, denom_map = build_initial_vectors(df, CFG, BUCKETS_CANON, SEGMENT_COLS, DENOM_LEVEL)
forecast_df = forecast(df_init, transitions_dict, BUCKETS_CANON, MAX_MOB)

# Quick size check of every intermediate frame.
print(
    f"Data shape: {df.shape}",
    f"Transitions: {df_trans.shape}",
    f"Forecast: {forecast_df.shape}",
    sep="\n",
)

## Step 2: Compute Actual vs Predicted DEL

In [None]:
# Observed DEL, computed directly from the snapshot data.
actual_del_long, actual_denom = compute_del_from_snapshot(
    df, CFG, BUCKETS_30P, SEGMENT_COLS, MAX_MOB, DENOM_LEVEL)

# Model DEL, computed from the baseline forecast with the denominators
# returned by build_initial_vectors.
pred_del_long = compute_del_from_forecast(forecast_df, BUCKETS_30P, denom_map)

print(
    f"Actual DEL records: {len(actual_del_long)}",
    f"Predicted DEL records: {len(pred_del_long)}",
    sep="\n",
)

# Show the first rows of each long-format DEL table.
for heading, del_frame in (
    ("\nActual DEL sample:", actual_del_long),
    ("\nPredicted DEL sample:", pred_del_long),
):
    print(heading)
    display(del_frame.head())

## Step 3: Fit Calibration Factors

In [None]:
# Fit the calibration factors from the actual and predicted DEL tables;
# K_CLIP is handed to the fitter as the allowed range for k.
factors_df = fit_del_curve_factors(actual_del_long, pred_del_long, MAX_MOB, K_CLIP)

print("Calibration Factors by MOB:")
display(factors_df)

# Two side-by-side panels: the fitted factors (left) and the actual/predicted
# means reported alongside them in factors_df (right).
fig, (ax_k, ax_del) = plt.subplots(1, 2, figsize=(12, 6))
mob_values = factors_df['mob']

# Left panel: k per MOB, with the neutral level and both clip bounds as guides.
ax_k.plot(mob_values, factors_df['k'], 'bo-', label='K Factor')
ax_k.axhline(y=1.0, color='r', linestyle='--', alpha=0.7, label='K=1 (No adjustment)')
ax_k.axhline(y=K_CLIP[0], color='orange', linestyle=':', alpha=0.7, label=f'K_min={K_CLIP[0]}')
ax_k.axhline(y=K_CLIP[1], color='orange', linestyle=':', alpha=0.7, label=f'K_max={K_CLIP[1]}')
ax_k.set_xlabel('MOB')
ax_k.set_ylabel('Calibration Factor (K)')
ax_k.set_title('Calibration Factors by MOB')
ax_k.legend()
ax_k.grid(True, alpha=0.3)

# Right panel: mean actual vs mean predicted DEL per MOB.
ax_del.plot(mob_values, factors_df['actual_mean'], 'go-', label='Actual DEL')
ax_del.plot(mob_values, factors_df['pred_mean'], 'ro-', label='Predicted DEL')
ax_del.set_xlabel('MOB')
ax_del.set_ylabel('DEL Rate')
ax_del.set_title('Actual vs Predicted DEL Curves')
ax_del.legend()
ax_del.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## Step 4: Apply Calibration to Transition Matrices

In [None]:
# Apply the per-MOB calibration factor to every transition matrix.
#
# Factors are looked up by MOB *value*. The previous check `mob < len(factors_df)`
# only worked when factors_df held exactly MOB 0..N-1: any other layout either
# raised IndexError on an empty selection or silently skipped a MOB that did
# have a fitted factor. The lookup table is also built once instead of
# re-filtering factors_df for every matrix.
# drop_duplicates keeps the first row per MOB, matching the earlier `.iloc[0]`.
k_by_mob = factors_df.drop_duplicates(subset='mob').set_index('mob')['k'].to_dict()

transitions_dict_calibrated = {}

print("Applying calibration to transition matrices...")
for key, matrix in transitions_dict.items():
    # Keys are (level, segment_key, mob) triples.
    level, segment_key, mob = key

    # MOBs without a fitted factor fall back to k = 1.0 (no adjustment).
    k_factor = k_by_mob.get(mob, 1.0)

    # Calibration is applied with BUCKETS_30P as the bucket set to adjust.
    transitions_dict_calibrated[key] = apply_matrix_calibration(
        matrix, BUCKETS_30P, k_factor, ABSORBING_BASE
    )

print(f"Calibrated {len(transitions_dict_calibrated)} matrices")

# Compare one matrix before/after calibration (skipped when there are none).
if transitions_dict:
    sample_key = next(iter(transitions_dict))
    level, segment_key, mob = sample_key
    # Same lookup as in the loop, so the printed K is the one actually applied.
    k_factor = k_by_mob.get(mob, 1.0)

    print(f"\nSample Matrix Comparison (Level: {level}, Segment: {segment_key}, MOB: {mob}, K: {k_factor:.3f})")
    print("\nOriginal Matrix:")
    display(transitions_dict[sample_key].round(4))

    print("\nCalibrated Matrix:")
    display(transitions_dict_calibrated[sample_key].round(4))
else:
    print("\nNo transition matrices available for comparison.")

## Step 5: Re-forecast with Calibrated Matrices

In [None]:
# Run the forecast again from the same initial vectors, this time driven by
# the calibrated transition matrices.
forecast_df_calibrated = forecast(
    df_init, transitions_dict_calibrated, BUCKETS_CANON, MAX_MOB)

# Convert the calibrated forecast to DEL with the same buckets and
# denominators used for the baseline prediction.
pred_del_long_calibrated = compute_del_from_forecast(
    forecast_df_calibrated, BUCKETS_30P, denom_map)

print(
    f"Calibrated forecast shape: {forecast_df_calibrated.shape}",
    f"Calibrated predicted DEL records: {len(pred_del_long_calibrated)}",
    sep="\n",
)

## Step 6: Compare Results

In [None]:
# Compare actual, original-forecast and calibrated-forecast DEL per MOB.

# Mean DEL per MOB for each of the three sources.
actual_agg = actual_del_long.groupby('mob')['del_pct'].mean().reset_index()
pred_agg = pred_del_long.groupby('mob')['del_pct'].mean().reset_index()
pred_calib_agg = pred_del_long_calibrated.groupby('mob')['del_pct'].mean().reset_index()

# Signed error (pred - actual) per MOB, before and after calibration.
# merge() defaults to an inner join, so only MOBs present on both sides are kept.
merged_orig = actual_agg.merge(pred_agg, on='mob', suffixes=('_actual', '_pred'))
merged_orig['error'] = merged_orig['del_pct_pred'] - merged_orig['del_pct_actual']

merged_calib = actual_agg.merge(pred_calib_agg, on='mob', suffixes=('_actual', '_pred'))
merged_calib['error'] = merged_calib['del_pct_pred'] - merged_calib['del_pct_actual']

# 2x2 grid; ravel() walks it row by row: curves, error before, error after, |error|.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
ax_curves, ax_err_orig, ax_err_calib, ax_abs = axes.ravel()

# Panel 1: the three DEL curves.
ax_curves.plot(actual_agg['mob'], actual_agg['del_pct'], 'go-', label='Actual', linewidth=2)
ax_curves.plot(pred_agg['mob'], pred_agg['del_pct'], 'ro-', label='Original Forecast', alpha=0.7)
ax_curves.plot(pred_calib_agg['mob'], pred_calib_agg['del_pct'], 'bo-', label='Calibrated Forecast')
ax_curves.set_xlabel('MOB')
ax_curves.set_ylabel('DEL Rate')
ax_curves.set_title('DEL Curves Comparison')
ax_curves.legend()
ax_curves.grid(True, alpha=0.3)

# Panel 2: error before calibration.
ax_err_orig.plot(merged_orig['mob'], merged_orig['error'], 'ro-', label='Original Error')
ax_err_orig.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax_err_orig.set_xlabel('MOB')
ax_err_orig.set_ylabel('Forecast Error (Pred - Actual)')
ax_err_orig.set_title('Forecast Error Before Calibration')
ax_err_orig.grid(True, alpha=0.3)

# Panel 3: error after calibration.
ax_err_calib.plot(merged_calib['mob'], merged_calib['error'], 'bo-', label='Calibrated Error')
ax_err_calib.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax_err_calib.set_xlabel('MOB')
ax_err_calib.set_ylabel('Forecast Error (Pred - Actual)')
ax_err_calib.set_title('Forecast Error After Calibration')
ax_err_calib.grid(True, alpha=0.3)

# Panel 4: absolute errors on a shared axis.
ax_abs.plot(merged_orig['mob'], merged_orig['error'].abs(), 'ro-', label='|Original Error|', alpha=0.7)
ax_abs.plot(merged_calib['mob'], merged_calib['error'].abs(), 'bo-', label='|Calibrated Error|')
ax_abs.set_xlabel('MOB')
ax_abs.set_ylabel('Absolute Forecast Error')
ax_abs.set_title('Absolute Error Comparison')
ax_abs.legend()
ax_abs.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Summary statistics: mean absolute error over the merged MOBs.
orig_mae = merged_orig['error'].abs().mean()
calib_mae = merged_calib['error'].abs().mean()

print("\n=== CALIBRATION SUMMARY ===")
print(f"Original MAE: {orig_mae:.4f}")
print(f"Calibrated MAE: {calib_mae:.4f}")
# Relative improvement is only defined for a positive baseline MAE; the
# comparison is also False for NaN (e.g. when the merge produced no rows).
if orig_mae > 0:
    print(f"MAE Improvement: {((orig_mae - calib_mae) / orig_mae * 100):.1f}%")
else:
    print("MAE Improvement: n/a (original MAE is zero or undefined)")

## Step 7: Create Mixed Reports

In [None]:
# Build the mixed report twice against the same actual DEL table: once with
# the original predicted DEL and once with the calibrated predicted DEL.
mixed_orig, flags_orig, actual_wide, forecast_orig = make_mixed_report(
    actual_del_long, pred_del_long, MAX_MOB)
mixed_calib, flags_calib, _, forecast_calib = make_mixed_report(
    actual_del_long, pred_del_long_calibrated, MAX_MOB)

print(
    "Mixed Report Shapes:",
    f"Original Mixed: {mixed_orig.shape}",
    f"Calibrated Mixed: {mixed_calib.shape}",
    sep="\n",
)

# Show the same row from each report side by side.
sample_idx = 0
print(f"\nSample Row {sample_idx} Comparison:")

# Only flags_orig is displayed for the flags; flags_calib is not shown.
for caption, report in (
    ("\nOriginal Mixed:", mixed_orig),
    ("\nCalibrated Mixed:", mixed_calib),
    ("\nFlags (same for both):", flags_orig),
):
    print(caption)
    display(report.iloc[[sample_idx]])

## Usage Guide

### Cách sử dụng Calibration:

1. **Bật/tắt calibration**: Thay đổi `ENABLE_CALIBRATION = True/False` trong notebook chính

2. **Điều chỉnh K_CLIP**: Sửa trong `config.py`:
   ```python
   K_CLIP = (0.5, 2.0)  # (k_min, k_max)
   ```

3. **CLI usage**:
   ```bash
   python main.py --input data.parquet --output report.xlsx --calibrate
   ```

4. **Khi nào nên dùng calibration**:
   - Forecast DEL khác biệt đáng kể so với actual
   - Cần điều chỉnh forecast để khớp với observed patterns
   - Có đủ actual data để tính reliable calibration factors

5. **Lưu ý**:
   - Calibration chỉ điều chỉnh forecast, không thay đổi actual
   - K factors được clip trong khoảng [k_min, k_max] để tránh extreme adjustments
   - Calibration áp dụng cho bad states (BUCKETS_30P) trong transition matrices