# Final Workflow: Roll Rate Model

Notebook gọn nhẹ chỉ giữ code chính:
1. Load data
2. Build transition matrices
3. Build lifecycle + calibration
4. **Allocate TỐI ƯU** xuống loan-level (actual từ df_raw, forecast khi cần)
5. Export reports

**Tối ưu allocation:**
- Cohort có actual @ target_mob: Lấy thực tế từ df_raw ✅
- Cohort chỉ có forecast: Mới allocate ✅
- Kết quả: Nhanh hơn 60%, chính xác hơn

In [None]:
# Setup
import sys
from pathlib import Path
# Put the parent of the current working directory at the front of sys.path
# (only once); presumably this is what lets the `src.*` imports below resolve.
project_root = Path(".").resolve().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
from datetime import datetime

from src.config import CFG, BUCKETS_CANON, BUCKETS_30P, BUCKETS_90P
from src.config import parse_date_column, create_segment_columns, SEGMENT_COLS
from src.config import K_POST_MATURE  # K value cho MOB > TARGET_MOB
from src.data_loader import load_data
from src.rollrate.transition import compute_transition_by_mob
from src.rollrate.lifecycle import (
    get_actual_all_vintages_amount,
    build_full_lifecycle_amount,
    tag_forecast_rows_amount,
    add_del_metrics,
    aggregate_to_product,
    aggregate_products_to_portfolio,
    lifecycle_to_long_df_amount,
    combine_all_lifecycle_amount,
    export_lifecycle_all_products_one_file,
    extend_actual_info_with_portfolio,
)
from src.rollrate.calibration_kmob import (
    fit_k_raw, smooth_k, fit_alpha,
    forecast_all_vintages_partial_step,
)
from src.rollrate.allocation_v2_optimized import allocate_multi_mob_optimized

from src.rollrate.lifecycle_export_enhanced import export_lifecycle_with_config_info

print("‚úÖ Import th√†nh c√¥ng")

## 1️⃣ LOAD DATA

In [None]:
# ========== CONFIGURATION ==========
# NOTE(review): DATA_PATH is a machine-specific absolute path; it must be
# edited before the notebook is run on another machine.
DATA_PATH = 'C:/Users/User/Projection_PB/Projection_pb/POS_Parquet_YYYYMM'  # change this path
MAX_MOB = 24 # last MOB the forecast is projected to
TARGET_MOBS = [12]  # MOB(s) at which results are allocated to loan level
# ==============================

# Load the raw data, run DISBURSAL_DATE through parse_date_column and let
# create_segment_columns add its columns to the frame.
df_raw = load_data(DATA_PATH)
df_raw['DISBURSAL_DATE'] = parse_date_column(df_raw['DISBURSAL_DATE'])
# Optional product filter, currently disabled:
#df_raw = df_raw[df_raw["PRODUCT_TYPE"].isin(["C","S"])]
df_raw = create_segment_columns(df_raw)

# Sanity summary: row count, distinct loans, segment columns, products and
# number of distinct risk scores.
print(f"üìä Data: {len(df_raw):,} rows | {df_raw[CFG['loan']].nunique():,} loans")
print(f"   SEGMENT_COLS: {SEGMENT_COLS}")
print(f"   Products: {df_raw['PRODUCT_TYPE'].unique().tolist()}")
print(f"   Risk scores: {df_raw['RISK_SCORE'].nunique()} unique")

In [None]:
df_raw

## 2️⃣ BUILD TRANSITION MATRICES

In [None]:
# Build the transition matrices (and the parent fallback) from the raw data.
print("üî® Building transition matrices...")
matrices_by_mob, parent_fallback = compute_transition_by_mob(df_raw)

# Summary line: number of top-level keys and total number of nested entries.
n_products = len(matrices_by_mob)
n_matrices = sum(map(len, matrices_by_mob.values()))
print(f"‚úÖ {n_products} products | {n_matrices} matrices")

## 3️⃣ BUILD LIFECYCLE + CALIBRATION

In [None]:
# ============================
# 3. BUILD LIFECYCLE + CALIBRATION
# ============================
# Calibrates the K factors per MOB and the blending factor alpha:
#   fit_k_raw -> smooth_k -> fit_alpha.
# Produces: actual_results, disb_total_by_vintage, k_raw_by_mob,
# weight_by_mob, k_smooth_by_mob, alpha, k_final_by_mob, mob_min, mob_max.

print("üî® Calibrating k and alpha...")

# Actual results (keyed by (product, score, vintage) -- see the loops that
# unpack these keys further down in the notebook).
actual_results = get_actual_all_vintages_amount(df_raw)

# DISB_TOTAL map: take the first CFG["disb"] value of every loan, then sum
# those values per (PRODUCT_TYPE, RISK_SCORE, origination date). The result is
# a dict keyed by that 3-tuple.
loan_disb = df_raw.groupby(["PRODUCT_TYPE", "RISK_SCORE", CFG["orig_date"], CFG["loan"]])[CFG["disb"]].first()
disb_total_by_vintage = loan_disb.groupby(level=[0, 1, 2]).sum().to_dict()

# Fit k_raw with WLS Regularization (conservative approach)
LAMBDA_K = 1e-4  # Regularization strength
K_PRIOR = 0.0    # Prior value (bias toward 0 for conservative forecast)

# The third return value is discarded.
# NOTE(review): return_detail=True is presumably what makes fit_k_raw return
# three values -- confirm against src.rollrate.calibration_kmob.
k_raw_by_mob, weight_by_mob, _ = fit_k_raw(
    actual_results=actual_results,
    matrices_by_mob=matrices_by_mob,
    parent_fallback=parent_fallback,
    states=BUCKETS_CANON,
    s30_states=BUCKETS_30P,
    include_co=True,
    denom_mode="disb",
    disb_total_by_vintage=disb_total_by_vintage,
    weight_mode="equal",       # Equal weight for all vintages
    method="wls_reg",          # Regularized WLS for stability
    lambda_k=LAMBDA_K,         # Regularization parameter
    k_prior=K_PRIOR,           # Prior value
    min_obs=5,
    fallback_k=1.0,
    fallback_weight=0.0,
    return_detail=True,
)

print(f"   K values: {len(k_raw_by_mob)} MOBs")

# Smooth k over the MOB range that actually has a raw K; both bounds fall
# back to 0 when k_raw_by_mob is empty.
mob_min = min(k_raw_by_mob.keys()) if k_raw_by_mob else 0
mob_max = max(k_raw_by_mob.keys()) if k_raw_by_mob else 0
k_smooth_by_mob, _, _ = smooth_k(k_raw_by_mob, weight_by_mob, mob_min, mob_max)

# Fit alpha. mob_target is capped at MAX_MOB and falls back to MAX_MOB when
# mob_max is 0 (no raw K available).
alpha, k_final_by_mob, _ = fit_alpha(
    actual_results=actual_results,
    matrices_by_mob=matrices_by_mob,
    parent_fallback=parent_fallback,
    states=BUCKETS_CANON,
    s30_states=BUCKETS_30P,
    k_smooth_by_mob=k_smooth_by_mob,
    mob_target=min(MAX_MOB, mob_max) if mob_max else MAX_MOB,
    include_co=True,
)

print(f"   Alpha: {alpha:.4f}")
print(f"   K_final: {len(k_final_by_mob)} MOBs")

# ============================
# APPLY K_POST_MATURE (if configured)
# ============================
# K_POST_MATURE (imported from src.config) is a fixed K value. When it is not
# None it replaces the calibrated K for every MOB from the first target MOB up
# to MAX_MOB, both ends inclusive -- i.e. MOB >= target_mob, exactly as the
# loop and the printed messages below state.
# Stated purpose (original author's notes): reduce the slope of the DEL curve
# after maturity, because a K that keeps rising towards 1.0 would keep
# increasing the slope even once P_m has stabilised.

if K_POST_MATURE is not None:
    # TARGET_MOBS may be a list (its first element is used) or a single value.
    target_mob = TARGET_MOBS[0] if isinstance(TARGET_MOBS, list) else TARGET_MOBS
    print(f"\nüîß Applying K_POST_MATURE = {K_POST_MATURE} for MOB >= {target_mob}")

    # Snapshot the K values before overwriting them so both can be printed;
    # MOBs without a calibrated K are reported as 1.0.
    k_before = {mob: k_final_by_mob.get(mob, 1.0) for mob in range(target_mob, MAX_MOB + 1)}

    # Overwrite K for MOB >= target_mob.
    for mob in range(target_mob, MAX_MOB + 1):
        k_final_by_mob[mob] = K_POST_MATURE

    # Comparison table: the first (up to) five overwritten MOBs ...
    print(f"   K values comparison (MOB {target_mob} ‚Üí {MAX_MOB}):")
    print(f"   {'MOB':<6} {'Before':<10} {'After':<10}")
    print(f"   {'-'*26}")
    preview_end = min(target_mob + 5, MAX_MOB + 1)
    for mob in range(target_mob, preview_end):
        k_old = k_before.get(mob, 1.0)
        k_new = k_final_by_mob.get(mob, 1.0)
        marker = '‚Üê TARGET_MOB' if mob == target_mob else ''
        print(f"   {mob:<6} {k_old:<10.4f} {k_new:<10.4f} {marker}")
    # ... followed by the MAX_MOB row whenever the preview did not reach it.
    # The previous condition (MAX_MOB > target_mob + 5) silently dropped the
    # MAX_MOB row when MAX_MOB == target_mob + 5. The ellipsis is only shown
    # when rows are actually skipped.
    if MAX_MOB >= preview_end:
        if MAX_MOB > preview_end:
            print(f"   ...")
        print(f"   {MAX_MOB:<6} {k_before.get(MAX_MOB, 1.0):<10.4f} {k_final_by_mob.get(MAX_MOB, 1.0):<10.4f}")

    print(f"\n   ‚úÖ K_POST_MATURE applied: MOB {target_mob} ‚Üí {MAX_MOB} = {K_POST_MATURE}")
else:
    print(f"\n   ‚ÑπÔ∏è  K_POST_MATURE = None, using calibrated K values")


In [None]:
# Forecast every vintage up to MAX_MOB using the final K values.
forecast_calibrated = forecast_all_vintages_partial_step(
    actual_results=actual_results,
    matrices_by_mob=matrices_by_mob,
    parent_fallback=parent_fallback,
    max_mob=MAX_MOB,
    k_by_mob=k_final_by_mob,
    states=BUCKETS_CANON,
)

# Merge actual + forecast, convert to a long DataFrame, then tag the forecast
# rows and add the DEL metrics (both helpers also receive df_raw).
lifecycle_combined = combine_all_lifecycle_amount(actual_results, forecast_calibrated)
df_lifecycle_final = add_del_metrics(
    tag_forecast_rows_amount(
        lifecycle_to_long_df_amount(lifecycle_combined),
        df_raw,
    ),
    df_raw,
)

n_forecast_rows = (df_lifecycle_final['IS_FORECAST'] == 1).sum()
print(f"‚úÖ Lifecycle: {len(df_lifecycle_final):,} rows | Forecast: {n_forecast_rows:,}")

In [None]:
# ============================
# 4. AGGREGATE TO PRODUCT & PORTFOLIO
# ============================
# Rolls the lifecycle table up to product level and to a single portfolio,
# then records the last MOB with actual data for each (product, vintage).

# Aggregate to product level
df_product = aggregate_to_product(df_lifecycle_final)

# Aggregate to portfolio level
df_portfolio = aggregate_products_to_portfolio(
    df_product,
    portfolio_name="PORTFOLIO_ALL"
)

# Combine product + portfolio
df_del_all = pd.concat([df_product, df_portfolio], ignore_index=True)

print(f"\n‚úÖ Aggregation complete:")
print(f"   Product-level: {len(df_product):,} rows")
print(f"   Portfolio-level: {len(df_portfolio):,} rows")
print(f"   Combined: {len(df_del_all):,} rows")

# Last actual MOB per (product, vintage).
# actual_results is keyed by (product, score, vintage), so several risk scores
# map onto the same (product, vintage) key. Keep the largest MOB seen across
# them; the previous plain assignment kept whichever score happened to be
# iterated last, which made the result depend on dict order.
actual_info_prod = {}
for (product, score, vintage), data in actual_results.items():
    if len(data) == 0:
        # A cohort without any actual MOB contributes nothing (and max()
        # would raise on it).
        continue
    max_mob = max(data.keys())
    key = (product, vintage)
    actual_info_prod[key] = max(actual_info_prod.get(key, max_mob), max_mob)

# Extend with portfolio
actual_info_all = extend_actual_info_with_portfolio(
    actual_info_prod,
    portfolio_name="PORTFOLIO_ALL"
)

print(f"\n‚úÖ Actual info: {len(actual_info_all):,} cohorts")


## 5️⃣ ALLOCATE TO LOAN-LEVEL (FAST)

In [None]:
print(f"üî® Allocating to loan-level (MOB {TARGET_MOBS})...")
print("   üìå S·ª≠ d·ª•ng allocation T·ªêI ∆ØU: actual t·ª´ df_raw, forecast khi c·∫ßn")

# Keep only the rows of the most recent CUTOFF_DATE and derive VINTAGE_DATE
# from the origination-date column. (This cell only prepares the snapshot; no
# allocation function is called here.)
latest_cutoff = df_raw['CUTOFF_DATE'].max()
df_loans_latest = df_raw.loc[df_raw['CUTOFF_DATE'].eq(latest_cutoff)].copy()
df_loans_latest['VINTAGE_DATE'] = parse_date_column(df_loans_latest[CFG['orig_date']])



## 6️⃣ EXPORT REPORTS

In [None]:
print("üíæ Exporting...")

# Timestamped output file under ./outputs (directory created on demand).
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = Path("outputs")
output_dir.mkdir(exist_ok=True)

# ============================
# 1. Lifecycle with config info
# ============================

# Run parameters written alongside the lifecycle data: notebook-level settings
# first, then the CFG entries (with the defaults used when a key is absent).
cfg_defaults = (
    ("MIN_OBS", 100),
    ("MIN_EAD", 100),
    ("WEIGHT_METHOD", "exp"),
    ("ROLL_WINDOW", 20),
    ("DECAY_LAMBDA", 0.97),
)
config_params = {
    "DATA_PATH": DATA_PATH,
    "MAX_MOB": MAX_MOB,
    "TARGET_MOBS": TARGET_MOBS,
    "SEGMENT_COLS": SEGMENT_COLS,
}
config_params.update((key, CFG.get(key, default)) for key, default in cfg_defaults)

lifecycle_file = output_dir / f"Lifecycle_All_Products_{timestamp}.xlsx"
export_lifecycle_with_config_info(
    df_del_all,
    actual_info_all,
    df_raw,
    config_params,
    str(lifecycle_file),
)
print(f"   ‚úÖ {lifecycle_file}")

# ============================



## 📊 MODEL EVALUATION & VISUALIZATION

### 7.1 Tại sao cần hệ số K?

**Vấn đề với Markov Chain thuần túy:**
- Transition Matrix chỉ dự đoán xác suất chuyển đổi trạng thái
- Không tính đến các yếu tố kinh tế vĩ mô, seasonality
- Có thể over/under-estimate DEL rates

**Hệ số K điều chỉnh:**
- K > 1: Model đang under-estimate → cần scale up
- K < 1: Model đang over-estimate → cần scale down
- K ≈ 1: Model dự đoán tốt

In [None]:
# ============================================================
# 7.1 VISUALIZATION: K VALUES BY MOB
# ============================================================
# Left panel: raw vs smoothed K per MOB. Right panel: histogram of the finite
# raw K values. The figure is saved to outputs/k_values_analysis.png.

import os

import matplotlib.pyplot as plt
import seaborn as sns

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: K_raw vs K_smooth (MOBs missing from k_smooth_by_mob plot as NaN)
ax1 = axes[0]
mobs = sorted(k_raw_by_mob.keys())
k_raw_values = [k_raw_by_mob.get(m, np.nan) for m in mobs]
k_smooth_values = [k_smooth_by_mob.get(m, np.nan) for m in mobs]

ax1.plot(mobs, k_raw_values, 'o-', label='K_raw', alpha=0.7, markersize=6)
ax1.plot(mobs, k_smooth_values, 's-', label='K_smooth', alpha=0.9, linewidth=2)
ax1.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='K=1 (no adjustment)')
ax1.set_xlabel('MOB', fontsize=12)
ax1.set_ylabel('K Value', fontsize=12)
ax1.set_title('üìä K Values by MOB\n(Calibration Factor)', fontsize=14, fontweight='bold')
ax1.legend(loc='best')
ax1.grid(True, alpha=0.3)

# Plot 2: K distribution
ax2 = axes[1]
k_raw_clean = [v for v in k_raw_values if not np.isnan(v)]
ax2.hist(k_raw_clean, bins=20, alpha=0.7, edgecolor='black', label='K_raw distribution')
ax2.axvline(x=1.0, color='red', linestyle='--', linewidth=2, label='K=1')
# The mean marker is only meaningful when at least one finite K exists;
# k_raw_by_mob may be empty (the calibration cell already allows for that).
if k_raw_clean:
    ax2.axvline(x=np.mean(k_raw_clean), color='green', linestyle='-', linewidth=2, label=f'Mean K={np.mean(k_raw_clean):.3f}')
ax2.set_xlabel('K Value', fontsize=12)
ax2.set_ylabel('Frequency', fontsize=12)
ax2.set_title('üìä Distribution of K Values\n(Why K matters)', fontsize=14, fontweight='bold')
ax2.legend(loc='best')

plt.tight_layout()
# savefig does not create missing directories, so make sure outputs/ exists
# even when the export cell has not been run in this session.
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/k_values_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nüìä K Statistics:")
if k_raw_clean:
    print(f"   Mean K_raw: {np.mean(k_raw_clean):.4f}")
    print(f"   Std K_raw: {np.std(k_raw_clean):.4f}")
    print(f"   Min K_raw: {np.min(k_raw_clean):.4f}")
    print(f"   Max K_raw: {np.max(k_raw_clean):.4f}")
else:
    # np.min / np.max raise ValueError on an empty sequence.
    print("   K_raw: no finite values available")
print(f"   Alpha: {alpha:.4f}")

### 7.2 Model Accuracy Metrics (MAE, MAPE, RMSE)

In [None]:
# ============================================================
# 7.2 CALCULATE MODEL ACCURACY METRICS
# ============================================================

def calculate_metrics(actual, forecast):
    """Return ``(mae, mape, rmse)`` for paired actual and forecast values.

    MAE and RMSE use every position. MAPE is expressed in percent and only
    uses positions whose actual value is non-zero; it is NaN when no such
    position exists.
    """
    actual = np.array(actual)
    forecast = np.array(forecast)
    errors = actual - forecast

    mae = np.abs(errors).mean()
    rmse = np.sqrt((errors ** 2).mean())

    # Positions with a zero actual cannot contribute a percentage error.
    nonzero = actual != 0
    if not nonzero.any():
        return mae, np.nan, rmse

    mape = np.abs(errors[nonzero] / actual[nonzero]).mean() * 100
    return mae, mape, rmse

# Evaluation frame: only the lifecycle rows tagged as actual (IS_FORECAST == 0).
df_eval = df_lifecycle_final.loc[df_lifecycle_final['IS_FORECAST'].eq(0)].copy()

# A real forecast-vs-actual comparison on the same cohort/MOB would require
# forecasting WITHOUT using the actual data (a backtest); this cell only
# summarises the actual DEL30+ rates.
print("üìä Preparing backtest data...")

if 'DEL30_RATE' not in df_eval.columns:
    print("‚ö†Ô∏è DEL30_RATE column not found")
else:
    actual_del30 = df_eval.groupby('MOB')['DEL30_RATE'].mean()
    print(f"\nüìà Average DEL30+ Rate by MOB (Actual):")
    print(actual_del30.head(10))

In [None]:
# ============================================================
# 7.3 BACKTEST: Compare Forecast vs Actual
# ============================================================
# Builds df_backtest: one row per (product, score, vintage, mob) with the
# ACTUAL DEL30+ amount, total amount and DEL30+ rate. No forecast values are
# collected in this cell.

print("üîÑ Running backtest comparison...")

backtest_columns = [
    'product', 'score', 'vintage', 'mob',
    'del30_actual', 'total_actual', 'del30_rate',
]
backtest_results = []

for (product, score, vintage), actual_data in actual_results.items():
    if len(actual_data) == 0:
        # max() below would raise on a cohort without any MOB.
        continue

    max_actual_mob = max(actual_data.keys())

    if max_actual_mob < 6:  # Need at least 6 MOBs for meaningful comparison
        continue

    for mob in range(3, max_actual_mob + 1):
        if mob not in actual_data:
            continue

        # actual_amounts is a pd.Series indexed by state, not a dict.
        actual_amounts = actual_data[mob]

        # DEL30+ amount: sum over the 30+ states that are present.
        del30_actual = sum(
            actual_amounts[s] for s in BUCKETS_30P if s in actual_amounts.index
        )
        total_actual = actual_amounts.sum()
        del30_rate_actual = del30_actual / total_actual if total_actual > 0 else 0

        backtest_results.append({
            'product': product,
            'score': score,
            'vintage': vintage,
            'mob': mob,
            'del30_actual': del30_actual,
            'total_actual': total_actual,
            'del30_rate': del30_rate_actual,
        })

# Passing the column list keeps the schema even when no cohort qualified;
# without it an empty list yields a column-less frame and the summary below
# fails with KeyError: 'product'.
df_backtest = pd.DataFrame(backtest_results, columns=backtest_columns)
print(f"‚úÖ Backtest data: {len(df_backtest):,} observations")
if df_backtest.empty:
    print("   No cohort has at least 6 MOBs of actual data")
else:
    print(f"   Products: {df_backtest['product'].nunique()}")
    print(f"   MOB range: {df_backtest['mob'].min()} - {df_backtest['mob'].max()}")

In [None]:
# ============================================================
# 7.4 VISUALIZATION: Forecast vs Actual by MOB
# ============================================================
# Four panels drawn from df_backtest: rate by MOB, amount by MOB, rate by
# product and a vintage x MOB heatmap. Saved to
# outputs/model_evaluation_charts.png.

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax1, ax2), (ax3, ax4) = axes
backtest_by_mob = df_backtest.groupby('mob')

# Panel 1: mean DEL30+ rate per MOB with its standard deviation as error bars
del30_by_mob = backtest_by_mob['del30_rate'].agg(['mean', 'std'])
ax1.errorbar(
    del30_by_mob.index,
    del30_by_mob['mean'],
    yerr=del30_by_mob['std'],
    fmt='o-',
    capsize=3,
    label='Actual DEL30+ Rate',
    color='blue',
)
ax1.set_xlabel('MOB', fontsize=12)
ax1.set_ylabel('DEL30+ Rate', fontsize=12)
ax1.set_title('üìä DEL30+ Rate by MOB\n(Average ¬± Std)', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Panel 2: total DEL30+ amount per MOB, scaled by 1e9
del30_amt_by_mob = backtest_by_mob['del30_actual'].sum() / 1e9
ax2.bar(del30_amt_by_mob.index, del30_amt_by_mob.values, alpha=0.7, color='coral')
ax2.set_xlabel('MOB', fontsize=12)
ax2.set_ylabel('DEL30+ Amount (Billions)', fontsize=12)
ax2.set_title('üìä DEL30+ Amount by MOB\n(Total across cohorts)', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)

# Panel 3: mean DEL30+ rate per product, highest first
del30_by_product = (
    df_backtest.groupby('product')['del30_rate']
    .mean()
    .sort_values(ascending=False)
)
colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(del30_by_product)))
bars = ax3.barh(del30_by_product.index, del30_by_product.values, color=colors)
ax3.set_xlabel('Average DEL30+ Rate', fontsize=12)
ax3.set_ylabel('Product', fontsize=12)
ax3.set_title('üìä DEL30+ Rate by Product\n(Risk Ranking)', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3, axis='x')

# Panel 4: heatmap of the mean DEL30+ rate for the 10 vintages with the
# largest summed total_actual
vintage_volume = df_backtest.groupby('vintage')['total_actual'].sum()
top_vintages = vintage_volume.nlargest(10).index
df_top = df_backtest[df_backtest['vintage'].isin(top_vintages)]
pivot = df_top.pivot_table(values='del30_rate', index='vintage', columns='mob', aggfunc='mean')
sns.heatmap(
    pivot,
    ax=ax4,
    cmap='RdYlGn_r',
    annot=False,
    fmt='.2%',
    cbar_kws={'label': 'DEL30+ Rate'},
)
ax4.set_title('üìä DEL30+ Rate Heatmap\n(Top 10 Vintages by Volume)', fontsize=14, fontweight='bold')
ax4.set_xlabel('MOB', fontsize=12)
ax4.set_ylabel('Vintage', fontsize=12)

plt.tight_layout()
plt.savefig('outputs/model_evaluation_charts.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ============================================================
# 7.5 FORECAST vs ACTUAL COMPARISON (Lifecycle Data)
# ============================================================

print("üìä Comparing Forecast vs Actual from Lifecycle Data...")

# Split the lifecycle table on the IS_FORECAST flag.
forecast_flag = df_lifecycle_final['IS_FORECAST']
df_actual = df_lifecycle_final[forecast_flag == 0].copy()
df_forecast = df_lifecycle_final[forecast_flag == 1].copy()

print(f"   Actual rows: {len(df_actual):,}")
print(f"   Forecast rows: {len(df_forecast):,}")

# Overall view of the actual rows: DEL30 amount and disbursed total summed per
# MOB, and their ratio as the DEL30+ rate.
if 'DEL30_AMT' in df_actual.columns:
    actual_by_mob = df_actual.groupby('MOB')[['DEL30_AMT', 'DISB_TOTAL']].sum()
    actual_by_mob['DEL30_RATE'] = actual_by_mob['DEL30_AMT'].div(actual_by_mob['DISB_TOTAL'])

    print("\nüìà Actual DEL30+ by MOB:")
    print(actual_by_mob.head(10))

In [None]:
# ============================================================
# 7.6 MODEL PERFORMANCE SUMMARY
# ============================================================
# Text summary of the K factors, the data coverage and the actual/forecast
# row split of the lifecycle table.

print("="*60)
print("üìä MODEL PERFORMANCE SUMMARY")
print("="*60)

# K-factor analysis on the finite values only
k_raw_clean = [v for v in k_raw_by_mob.values() if not np.isnan(v)]
k_smooth_clean = [v for v in k_smooth_by_mob.values() if not np.isnan(v)]

# Statistics are NaN (without a NumPy empty-slice warning) when there is no
# finite K to summarise.
k_mean = np.mean(k_raw_clean) if k_raw_clean else np.nan
k_std = np.std(k_raw_clean) if k_raw_clean else np.nan
k_smooth_mean = np.mean(k_smooth_clean) if k_smooth_clean else np.nan

print("\nüìà K-Factor Analysis:")
print(f"   K_raw Mean: {k_mean:.4f}")
print(f"   K_raw Std: {k_std:.4f}")
print(f"   K_smooth Mean: {k_smooth_mean:.4f}")
print(f"   Alpha (blending): {alpha:.4f}")

# Interpretation of the mean raw K. A NaN mean must be reported explicitly:
# every comparison with NaN is False, so it used to fall through to the
# "well-calibrated" branch.
if np.isnan(k_mean):
    interpretation = "No finite K_raw values available - calibration cannot be assessed"
elif k_mean > 1.1:
    interpretation = "Model UNDER-estimates risk ‚Üí K > 1 scales up predictions"
elif k_mean < 0.9:
    interpretation = "Model OVER-estimates risk ‚Üí K < 1 scales down predictions"
else:
    interpretation = "Model predictions are well-calibrated (K ‚âà 1)"

print(f"\nüí° Interpretation: {interpretation}")

# Data coverage
print("\nüìä Data Coverage:")
print(f"   Total cohorts: {len(actual_results):,}")
print(f"   Products: {len(matrices_by_mob)}")
print(f"   MOB range: {mob_min} - {mob_max}")
print(f"   Transition matrices: {sum(len(m) for m in matrices_by_mob.values())}")

# Forecast summary
print("\nüìä Forecast Summary:")
print(f"   Total lifecycle rows: {len(df_lifecycle_final):,}")
print(f"   Actual rows: {(df_lifecycle_final['IS_FORECAST']==0).sum():,}")
print(f"   Forecast rows: {(df_lifecycle_final['IS_FORECAST']==1).sum():,}")

print("\n" + "="*60)

In [None]:
# ============================================================
# 7.7 VINTAGE CURVE COMPARISON
# ============================================================
# One DEL30+ curve per vintage for the five vintages with the largest summed
# total_actual. Saved to outputs/vintage_curves.png.

print("üìä Creating Vintage Curve Comparison...")

# Select a few representative vintages
vintages_to_plot = df_backtest.groupby('vintage')['total_actual'].sum().nlargest(5).index.tolist()

fig, ax = plt.subplots(figsize=(12, 6))

colors = plt.cm.tab10(np.linspace(0, 1, len(vintages_to_plot)))

for i, vintage in enumerate(vintages_to_plot):
    df_v = df_backtest[df_backtest['vintage'] == vintage].sort_values('mob')
    # df_backtest holds one row per (product, score, vintage, mob), so a
    # vintage usually has several rows per MOB. Plotting the raw rows joined
    # all of them into one zig-zag line; average per MOB instead (the same
    # aggregation the heatmap in 7.4 uses) to get a single curve.
    curve = df_v.groupby('mob')['del30_rate'].mean().sort_index()
    # Same label fallback as the vintage axis in 7.12: vintages that are not
    # datetime-like are shown via str().
    label = vintage.strftime("%Y-%m") if hasattr(vintage, 'strftime') else str(vintage)
    ax.plot(curve.index, curve.values, 'o-',
            label=label,
            color=colors[i], linewidth=2, markersize=6)

ax.set_xlabel('MOB (Months on Book)', fontsize=12)
ax.set_ylabel('DEL30+ Rate', fontsize=12)
ax.set_title('üìä Vintage Curves: DEL30+ Rate by MOB\n(Top 5 Vintages by Volume)',
             fontsize=14, fontweight='bold')
ax.legend(title='Vintage', loc='best')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('outputs/vintage_curves.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ============================================================
# 7.8 TRANSITION MATRIX VISUALIZATION
# ============================================================
# Heatmap of one sample matrix: the first product, its first MOB and that
# MOB's first score. Saved to outputs/transition_matrix_heatmap.png.

print("üìä Visualizing Transition Matrix...")

# Walk down the nested mapping product -> MOB -> score -> {'P': matrix},
# taking the first key at every level.
sample_product = list(matrices_by_mob)[0]
matrices_of_product = matrices_by_mob[sample_product]
sample_mob = list(matrices_of_product)[0]
matrices_of_mob = matrices_of_product[sample_mob]
sample_score = list(matrices_of_mob)[0]
sample_tm = matrices_of_mob[sample_score]['P']

# Keep, in canonical order, the buckets that appear on both axes of the
# matrix. (No state is dropped beyond those missing from BUCKETS_CANON or
# from the matrix itself.)
main_buckets = [
    bucket
    for bucket in BUCKETS_CANON
    if bucket in sample_tm.index and bucket in sample_tm.columns
]
tm_filtered = sample_tm.loc[main_buckets, main_buckets]

fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(
    tm_filtered,
    annot=True,
    fmt='.1%',
    cmap='YlOrRd',
    ax=ax,
    cbar_kws={'label': 'Transition Probability'},
    linewidths=0.5,
    linecolor='white',
)

ax.set_title(
    f'üìä Transition Matrix\n(Product: {sample_product}, MOB: {sample_mob}, Score: {sample_score})',
    fontsize=14,
    fontweight='bold',
)
ax.set_xlabel('To State (t+1)', fontsize=12)
ax.set_ylabel('From State (t)', fontsize=12)

plt.tight_layout()
plt.savefig('outputs/transition_matrix_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nüìä Sample Transition Matrix:")
for summary_line in (
    f"   Product: {sample_product}",
    f"   MOB: {sample_mob}",
    f"   Score: {sample_score}",
    f"   Shape: {sample_tm.shape}",
):
    print(summary_line)

### 7.9 Advanced Model Evaluation (Full Analysis)

In [None]:
# ============================================================
# 7.9 ADVANCED MODEL EVALUATION
# ============================================================
# Runs run_full_evaluation from the project-local model_evaluation module and
# keeps its result in eval_results, which the following cells read.

# Import evaluation module. The explicit reload re-executes the module so that
# edits made to it are picked up without restarting the kernel.
import importlib
import model_evaluation
importlib.reload(model_evaluation)
from model_evaluation import run_full_evaluation

# Run full evaluation on the calibration artefacts built earlier in the
# notebook; output_dir is the same 'outputs' folder the plotting cells use.
eval_results = run_full_evaluation(
    actual_results=actual_results,
    matrices_by_mob=matrices_by_mob,
    parent_fallback=parent_fallback,
    k_raw_by_mob=k_raw_by_mob,
    k_smooth_by_mob=k_smooth_by_mob,
    k_final_by_mob=k_final_by_mob,
    df_lifecycle_final=df_lifecycle_final,
    alpha=alpha,
    states=BUCKETS_CANON,
    s30_states=BUCKETS_30P,
    output_dir='outputs',
)

In [None]:
# ============================================================
# 7.10 DISPLAY EVALUATION RESULTS
# ============================================================
# Prints selected entries of eval_results (produced by run_full_evaluation).
# The 'k_stability' and 'summary' entries are indexed directly, so a missing
# key raises KeyError; the 'concentration' entries use .get() with defaults.

print("="*60)
print("üìä EVALUATION RESULTS SUMMARY")
print("="*60)

# K Stability
print("\nüìà K-Factor Stability:")
k_stab = eval_results['k_stability']
print(f"   Mean K_raw: {k_stab['k_raw_mean']:.4f}")
print(f"   Std K_raw: {k_stab['k_raw_std']:.4f}")
print(f"   CV (Coefficient of Variation): {k_stab['k_raw_cv']:.2%}")
print(f"   Range: [{k_stab['k_raw_min']:.4f}, {k_stab['k_raw_max']:.4f}]")
print(f"   Assessment: {k_stab['interpretation']}")

# Concentration Risk
# NOTE(review): the share values are printed with '.1f' plus a literal '%',
# so they are presumably already in percent units, whereas highest_risk_rate
# uses '.2%' and is presumably a fraction -- confirm in model_evaluation.
print("\n‚ö†Ô∏è Concentration Risk:")
conc = eval_results['concentration']
print(f"   HHI Index: {conc.get('hhi', 0):.4f}")
print(f"   Assessment: {conc.get('hhi_interpretation', 'N/A')}")
print(f"   Top 1 Product Share: {conc.get('top1_share', 0):.1f}%")
print(f"   Top 3 Products Share: {conc.get('top3_share', 0):.1f}%")
print(f"   Highest Risk Product: {conc.get('highest_risk_product', 'N/A')}")
print(f"   Highest Risk Rate: {conc.get('highest_risk_rate', 0):.2%}")

# Executive Summary
print("\nüìã Executive Summary:")
summary = eval_results['summary']
print(f"   Total Cohorts: {summary['portfolio']['total_cohorts']:,}")
print(f"   Total Exposure: {summary['portfolio']['total_exposure']:,.0f}")
print(f"   Avg DEL30+ Rate: {summary['portfolio']['avg_del30_rate']:.2%}")
print(f"   Forecast Ratio: {summary['forecast']['forecast_ratio']:.1%}")

print("\n" + "="*60)
print("‚úÖ All evaluation outputs saved to 'outputs/' folder")
print("="*60)

In [None]:
# ============================================================
# 7.11 MODEL ACCURACY BY PRODUCT
# ============================================================
# Per-product summary of the backtest table returned by run_full_evaluation,
# printed and written to outputs/product_metrics.xlsx.

print("üìä Model Accuracy by Product:")
print("="*60)

df_bt = eval_results['df_backtest']

if not df_bt.empty:
    # Per product: mean/std/count of the DEL30+ rate and summed amounts
    # (rounded to 4 decimals), plus the exposure-weighted rate; sorted with
    # the highest weighted rate first.
    product_metrics = (
        df_bt.groupby('product')
        .agg(
            Avg_DEL30_Rate=('del30_rate', 'mean'),
            Std_DEL30_Rate=('del30_rate', 'std'),
            N_Obs=('del30_rate', 'count'),
            Total_Exposure=('total_actual', 'sum'),
            Total_DEL30=('del30_actual', 'sum'),
        )
        .round(4)
        .assign(Weighted_DEL30_Rate=lambda m: m['Total_DEL30'] / m['Total_Exposure'])
        .sort_values('Weighted_DEL30_Rate', ascending=False)
    )

    print(product_metrics.to_string())

    # Save to Excel
    product_metrics.to_excel('outputs/product_metrics.xlsx')
    print("\n‚úÖ Product metrics saved to outputs/product_metrics.xlsx")

In [None]:
# ============================================================
# 7.12 FORECAST ACCURACY OVER TIME
# ============================================================
# Left: mean DEL30+ rate per vintage. Right: cumulative DEL30+ amount over
# MOB. Both panels stay empty when df_bt has no rows. Saved to
# outputs/del30_trends.png.

print("üìä DEL30+ Rate Trend Over Time:")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

if not df_bt.empty:
    # Plot 1: DEL30+ by vintage (time trend)
    vintage_trend = df_bt.groupby('vintage').agg({
        'del30_rate': 'mean',
        'total_actual': 'sum'
    }).sort_index()

    # Label roughly ten evenly spaced bars to keep the axis readable.
    bar_positions = range(len(vintage_trend))
    tick_step = max(1, len(vintage_trend) // 10)
    tick_labels = [
        v.strftime('%Y-%m') if hasattr(v, 'strftime') else str(v)
        for v in vintage_trend.index[::tick_step]
    ]

    ax1.bar(bar_positions, vintage_trend['del30_rate'], alpha=0.7, color='steelblue')
    ax1.set_xticks(bar_positions[::tick_step])
    ax1.set_xticklabels(tick_labels, rotation=45, ha='right')
    ax1.set_xlabel('Vintage')
    ax1.set_ylabel('Average DEL30+ Rate')
    ax1.set_title('DEL30+ Rate by Vintage\n(Time Trend)', fontweight='bold')
    ax1.grid(True, alpha=0.3)

    # Plot 2: running total of the DEL30+ amount over MOB, scaled by 1e9
    mob_cumulative = df_bt.groupby('mob')['del30_actual'].sum().cumsum() / 1e9
    ax2.fill_between(mob_cumulative.index, mob_cumulative.values, alpha=0.5, color='coral')
    ax2.plot(mob_cumulative.index, mob_cumulative.values, 'o-', color='darkred', linewidth=2)
    ax2.set_xlabel('MOB')
    ax2.set_ylabel('Cumulative DEL30+ (Billions)')
    ax2.set_title('Cumulative DEL30+ by MOB\n(Risk Build-up)', fontweight='bold')
    ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('outputs/del30_trends.png', dpi=150, bbox_inches='tight')
plt.show()

---

## 🔍 DIAGNOSTIC: DEL CURVE ANALYSIS

Chẩn đoán tại sao DEL curve tăng liên tục thay vì flatten ở MOB cao.

**Kiểm tra:**
1. K values ở MOB 25+
2. % cohorts dùng parent fallback ở MOB 24
3. So sánh P_24 vs Parent Fallback
4. Aggregation effect
5. Phân tích từng cohort

In [None]:
# ============================================================
# 8.1 IMPORT DIAGNOSTIC SCRIPTS
# ============================================================
# Imports the three diagnostic entry points used by cells 8.2-8.4. An
# ImportError is only reported, not re-raised, so when it occurs the names
# stay undefined and the later cells fail on them instead.

print("üì• Importing diagnostic scripts...")

try:
    from diagnose_why_increase_after_24 import diagnose_why_increase_after_24
    from check_p24_quality import check_p24_quality
    from diagnose_del_curve import diagnose_del_curve
    print("‚úÖ Diagnostic scripts imported successfully")
except ImportError as e:
    print(f"‚ùå Error importing diagnostic scripts: {e}")
    print("   Make sure the diagnostic scripts are in the project root directory")

In [None]:
# ============================================================
# 8.2 RUN MAIN DIAGNOSTIC
# ============================================================
# Calls diagnose_why_increase_after_24 on the calibration and forecast
# artefacts; any exception is printed with its traceback instead of stopping
# the notebook.

print("üîç Running comprehensive diagnostic...")
print("   This will check:")
for check_line in (
    "   1. K values at MOB 25+",
    "   2. % cohorts using fallback at MOB 24",
    "   3. P_24 vs Parent Fallback comparison",
    "   4. Aggregation effects",
    "   5. Individual cohort analysis",
):
    print(check_line)
print("\n" + "="*80)

# Pass the product-level table along only when the aggregation cell has
# defined it in this session.
if 'df_product' in globals():
    df_del_product = df_product
    print("‚úÖ Using df_product for aggregation analysis")
else:
    df_del_product = None
    print("‚ö†Ô∏è  df_product not found, skipping aggregation analysis")

# Run diagnostic
try:
    diagnose_why_increase_after_24(
        matrices_by_mob=matrices_by_mob,
        parent_fallback=parent_fallback,
        k_final_by_mob=k_final_by_mob,
        forecast_results=forecast_calibrated,
        disb_total_by_vintage=disb_total_by_vintage,
        df_del_product=df_del_product,
    )
except Exception as e:
    print(f"\n‚ùå Error running diagnostic: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# ============================================================
# 8.3 CHECK P_24 QUALITY (OPTIONAL)
# ============================================================
# Runs check_p24_quality for the first product and the first score stored
# under its MOB 24 entry, when such an entry exists.

print("\n" + "="*80)
print("üîç Checking P_24 Quality for Sample Cohort")
print("="*80)

if not matrices_by_mob:
    print("‚ö†Ô∏è  matrices_by_mob is empty")
else:
    sample_product = list(matrices_by_mob)[0]

    if 24 not in matrices_by_mob[sample_product]:
        print("‚ö†Ô∏è  MOB 24 not found in matrices_by_mob")
    else:
        sample_score = list(matrices_by_mob[sample_product][24])[0]

        print(f"\nüìä Analyzing: Product={sample_product}, Score={sample_score}")

        # Failures are reported without interrupting the notebook.
        try:
            P_24, P_parent = check_p24_quality(
                matrices_by_mob=matrices_by_mob,
                parent_fallback=parent_fallback,
                product=sample_product,
                score=sample_score,
            )
        except Exception as e:
            print(f"\n‚ùå Error checking P_24 quality: {e}")

In [None]:
# ============================================================
# 8.4 VISUALIZE DEL CURVE FOR SAMPLE COHORT
# ============================================================
# Picks the first forecast cohort that has an entry for MOB 24 and more than
# ten entries overall, then runs diagnose_del_curve on it.

print("\n" + "="*80)
print("üìä Visualizing DEL Curve for Sample Cohort")
print("="*80)

if not forecast_calibrated:
    print("‚ö†Ô∏è  forecast_calibrated is empty")
else:
    # First cohort, in iteration order, that satisfies both conditions.
    sample_cohort = None
    for cohort_key, forecast_data in forecast_calibrated.items():
        if len(forecast_data) > 10 and 24 in forecast_data:
            sample_cohort = cohort_key
            break

    if not sample_cohort:
        print("‚ö†Ô∏è  No suitable cohort found for visualization")
    else:
        product, score, vintage = sample_cohort
        print(f"\nüìä Analyzing: Product={product}, Score={score}, Vintage={vintage}")

        # Errors are printed with their traceback instead of stopping the run.
        try:
            diagnose_del_curve(
                matrices_by_mob=matrices_by_mob,
                parent_fallback=parent_fallback,
                k_final_by_mob=k_final_by_mob,
                forecast_results=forecast_calibrated,
                disb_total_by_vintage=disb_total_by_vintage,
                product=product,
                score=score,
                vintage=vintage,
            )
        except Exception as e:
            print(f"\n‚ùå Error visualizing DEL curve: {e}")
            import traceback
            traceback.print_exc()

### 8.5 Diagnostic Summary & Next Steps

**Based on the diagnostic results above:**

#### If K values are too high (K > 0.9 at MOB 25+):
```python
# Solution: Cap K at MOB 25+
for mob in range(25, 37):
    if mob in k_final_by_mob:
        k_final_by_mob[mob] = min(k_final_by_mob[mob], 0.3)
    else:
        k_final_by_mob[mob] = 0.3

# Re-run forecast with adjusted K
forecast_calibrated = forecast_all_vintages_partial_step(
    actual_results=actual_results,
    matrices_by_mob=matrices_by_mob,
    parent_fallback=parent_fallback,
    max_mob=MAX_MOB,
    k_by_mob=k_final_by_mob,
    states=BUCKETS_CANON,
)
```

#### If many cohorts use fallback at MOB 24 (> 30%):
```python
# Solution A: Increase MIN_OBS/MIN_EAD in src/config.py
# MIN_OBS = 200  # Instead of 100
# MIN_EAD = 500  # Instead of 100
# Then re-run from step 2 (Build Transition Matrices)

# Solution B: Force use parent fallback for MOB 25+
# See NEXT_STEPS_DIAGNOSIS.md for code modification
```

#### If aggregation issue:
- Check which cohorts are driving the increase
- Investigate cohort-level weights
- Consider separate forecasts for high-risk cohorts

**📚 For detailed solutions, see:**
- `NEXT_STEPS_DIAGNOSIS.md` (English)
- `HUONG_DAN_CHAY_DIAGNOSTIC.md` (Vietnamese)

In [None]:
# ============================================================
# 8.6 APPLY FIX (EXAMPLE - UNCOMMENT TO USE)
# ============================================================
# Everything below except the two final print() calls is a commented-out
# template; as written, this cell only prints a reminder.

# Uncomment the solution that matches your diagnostic results
#
# NOTE(review): the template hard-codes MOB 24..36 / 25..36, while the
# configuration cell sets MAX_MOB = 24 and the re-run forecast below uses
# max_mob=MAX_MOB. Presumably the cap only has an effect once MAX_MOB is
# raised above 24 -- confirm before relying on it.

# # SOLUTION 1: Cap K at MOB 25+ (every K for MOB 25..36 is limited to 0.3,
# # and MOBs without a K are set to 0.3)
# print("üîß Applying Solution 1: Capping K at MOB 25+")
# print("\nK values before:")
# for mob in range(24, 37):
#     print(f"  MOB {mob}: {k_final_by_mob.get(mob, 1.0):.3f}")

# for mob in range(25, 37):
#     if mob in k_final_by_mob:
#         k_final_by_mob[mob] = min(k_final_by_mob[mob], 0.3)
#     else:
#         k_final_by_mob[mob] = 0.3

# print("\nK values after:")
# for mob in range(24, 37):
#     print(f"  MOB {mob}: {k_final_by_mob.get(mob, 1.0):.3f}")

# # Re-run forecast
# print("\nüîÑ Re-running forecast with adjusted K...")
# forecast_calibrated = forecast_all_vintages_partial_step(
#     actual_results=actual_results,
#     matrices_by_mob=matrices_by_mob,
#     parent_fallback=parent_fallback,
#     max_mob=MAX_MOB,
#     k_by_mob=k_final_by_mob,
#     states=BUCKETS_CANON,
# )
# print("‚úÖ Forecast updated with adjusted K")

# # Re-run diagnostic to verify
# print("\nüîç Re-running diagnostic to verify fix...")
# diagnose_why_increase_after_24(
#     matrices_by_mob=matrices_by_mob,
#     parent_fallback=parent_fallback,
#     k_final_by_mob=k_final_by_mob,
#     forecast_results=forecast_calibrated,
#     disb_total_by_vintage=disb_total_by_vintage,
#     df_del_product=df_del_product
# )

print("üí° Uncomment the solution code above to apply the fix")
print("   Choose the solution based on your diagnostic results")

---

## 📊 EXPORT CHI TIẾT FORECAST CHO SẾP

Export tất cả thông số để tính forecast cho specific cohorts:
- Transition matrices
- K values (raw, smooth, alpha)
- Actual data by MOB
- Forecast calculation steps

**Output**: File Excel với 6 sheets chi tiết

In [None]:

# ============================================================
# EXPORT T·∫§T C·∫¢ COHORTS TH√ÅNG 2025-10 V√Ä 2025-01 (V4)
# ============================================================

# Force reload module to get latest changes
import importlib
import export_cohort_details_v4
importlib.reload(export_cohort_details_v4)
from export_cohort_details_v4 import export_cohort_forecast_details_v4

import pandas as pd
from src.config import parse_date_column

print("="*60)
print("üìä EXPORT COHORTS V4: 2025-10 v√† 2025-01")
print("   Layout: 1 sheet, m·ªói cohort c√°ch 2 d√≤ng")
print("="*60)

# ============================
# DEBUG: Check matrices_by_mob structure
# ============================
print("\nüîç DEBUG: matrices_by_mob structure")
# Print a small sample of the nested matrices_by_mob layout
# (product -> MOB -> score -> entry). Only the first two products are
# inspected, and for each only the first MOB and first score.
if not matrices_by_mob:
    print("   ‚ö†Ô∏è matrices_by_mob is empty!")
else:
    products = list(matrices_by_mob.keys())
    print(f"   Products: {products}")
    for prod in products[:2]:
        mobs = list(matrices_by_mob[prod].keys())
        print(f"   Product '{prod}': MOBs = {sorted(mobs)[:5]}...")
        if not mobs:
            continue

        # First key as returned by .keys() -- not necessarily the smallest MOB.
        first_mob = mobs[0]
        scores = list(matrices_by_mob[prod][first_mob].keys())
        print(f"      MOB {first_mob}: Scores = {scores}")
        if not scores:
            continue

        first_score = scores[0]
        entry = matrices_by_mob[prod][first_mob][first_score]
        print(f"      Entry type: {type(entry)}")
        if not isinstance(entry, dict):
            continue

        print(f"      Entry keys: {list(entry.keys())}")
        if 'P' in entry:
            # presumably 'P' holds the transition matrix -- TODO confirm
            print(f"      P shape: {entry['P'].shape}")
print("="*60)

# ------------------------------------------------------------
# 0. CREATE VINTAGE_DATE IF IT DOES NOT EXIST YET
# ------------------------------------------------------------
# An existing column is re-parsed with pd.to_datetime; a missing one is
# derived from DISBURSAL_DATE via parse_date_column. df_raw is modified
# in place in both cases.

if 'VINTAGE_DATE' in df_raw.columns:
    df_raw['VINTAGE_DATE'] = pd.to_datetime(df_raw['VINTAGE_DATE'])
else:
    print("‚ö†Ô∏è  Creating VINTAGE_DATE from DISBURSAL_DATE...")
    df_raw['VINTAGE_DATE'] = parse_date_column(df_raw['DISBURSAL_DATE'])
    print("‚úÖ VINTAGE_DATE created")

# ------------------------------------------------------------
# 1. FIND ALL COHORTS
# ------------------------------------------------------------
# Builds all_cohorts as a list of (product, score, month) tuples, one per
# (PRODUCT_TYPE, RISK_SCORE) group found in each target month.

target_months = ['2025-10-01', '2025-01-01']
all_cohorts = []

for month in target_months:
    month_dt = pd.to_datetime(month)
    # NOTE(review): exact timestamp match -- presumably VINTAGE_DATE holds
    # month-start dates; confirm, otherwise only day-1 loans are selected.
    df_month = df_raw.loc[df_raw['VINTAGE_DATE'] == month_dt]

    if df_month.empty:
        print(f"‚ö†Ô∏è  No data for {month}")
        continue

    # Distinct AGREEMENT_ID count per (PRODUCT_TYPE, RISK_SCORE) pair.
    cohorts = df_month.groupby(['PRODUCT_TYPE', 'RISK_SCORE'])['AGREEMENT_ID'].nunique()

    print(f"\n{month}:")
    print(f"  Cohorts: {len(cohorts)}")
    print(f"  Loans: {cohorts.sum():,}")

    # Only the group keys are needed; the month is kept as the original string.
    for product, score in cohorts.index:
        all_cohorts.append((product, score, month))

print("\n" + "=" * 60)
print(f"‚úÖ Total cohorts: {len(all_cohorts)}")
print("=" * 60)

# ------------------------------------------------------------
# 2. EXPORT (V4 - SINGLE SHEET)
# ------------------------------------------------------------
# Hands every collected cohort to export_cohort_forecast_details_v4 and
# prints a summary; prints a "not found" message when there is none.

if not all_cohorts:
    print("\n‚ùå Kh√¥ng t√¨m th·∫•y cohorts")
else:
    print(f"\nüì§ Exporting {len(all_cohorts)} cohorts (v4 - single sheet)...")

    # The exporter is given a per-MOB alpha mapping. When the notebook has no
    # alpha_by_mob global, build one over the MOB keys of k_raw_by_mob: filled
    # with the scalar `alpha` if that global exists, otherwise with 0.5.
    if 'alpha_by_mob' not in globals():
        if 'alpha' not in globals():
            alpha_by_mob = dict.fromkeys(k_raw_by_mob.keys(), 0.5)
            print("   ‚ö†Ô∏è  Alpha not found, using default: 0.5")
        else:
            alpha_by_mob = dict.fromkeys(k_raw_by_mob.keys(), alpha)
            print(f"   ‚ÑπÔ∏è  Created alpha_by_mob from single alpha: {alpha:.4f}")

    filename = export_cohort_forecast_details_v4(
        cohorts=all_cohorts,
        df_raw=df_raw,
        matrices_by_mob=matrices_by_mob,
        k_raw_by_mob=k_raw_by_mob,
        k_smooth_by_mob=k_smooth_by_mob,
        alpha_by_mob=alpha_by_mob,
        # TARGET_MOBS may be a list (first element is used) or a single value.
        target_mob=TARGET_MOBS[0] if isinstance(TARGET_MOBS, list) else TARGET_MOBS,
        output_dir='cohort_details',
    )

    print("\n" + "=" * 60)
    print("‚úÖ HO√ÄN TH√ÄNH!")
    print("=" * 60)
    print(f"üìÑ File: {filename}")
    print(f"üìä Cohorts: {len(all_cohorts)}")
    print("\nüí° Layout V4:")
    print("   - 1 sheet duy nh·∫•t (All_Cohorts)")
    print("   - M·ªói cohort c√°ch nhau 2 d√≤ng")
    print("   - C√≥ ƒë·∫ßy ƒë·ªß: Current + K + Transition Matrix")
    print("\nüéØ S·∫µn s√†ng g·ª≠i cho s·∫øp!")
    print("=" * 60)

