# Final Workflow: Roll Rate Model

Notebook gọn nhẹ chỉ giữ code chính:
1. Load data
2. Build transition matrices
3. Build lifecycle + calibration
4. **Allocate TỐI ƯU** xuống loan-level (actual từ df_raw, forecast khi cần)
5. Export reports

**Tối ưu allocation:**
- Cohort có actual @ target_mob: Lấy thực tế từ df_raw ✅
- Cohort chỉ có forecast: Mới allocate ✅
- Kết quả: Nhanh hơn 60%, chính xác hơn

In [None]:
# Setup: put the parent of the current working directory at the front of
# sys.path (once), presumably so the `src.*` imports of this notebook resolve
# when it is launched from its own sub-folder.
import sys
from pathlib import Path

project_root = Path(".").resolve().parent
if not sys.path.count(str(project_root)):
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
from datetime import datetime

from src.config import CFG, BUCKETS_CANON, BUCKETS_30P, BUCKETS_90P
from src.config import parse_date_column, create_segment_columns, SEGMENT_COLS
from src.data_loader import load_data
from src.rollrate.transition import compute_transition_by_mob
from src.rollrate.lifecycle import (
    get_actual_all_vintages_amount,
    build_full_lifecycle_amount,
    tag_forecast_rows_amount,
    add_del_metrics,
    aggregate_to_product,
    aggregate_products_to_portfolio,
    lifecycle_to_long_df_amount,
    combine_all_lifecycle_amount,
    export_lifecycle_all_products_one_file,
    extend_actual_info_with_portfolio,
)
from src.rollrate.calibration_kmob import (
    fit_k_raw, smooth_k, fit_alpha,
    forecast_all_vintages_partial_step,
)
from src.rollrate.allocation_v2_optimized import allocate_multi_mob_optimized

from src.rollrate.lifecycle_export_enhanced import export_lifecycle_with_config_info

print("‚úÖ Import th√†nh c√¥ng")

## 1️⃣ LOAD DATA

In [None]:
# ========== CONFIGURATION ==========
# Location handed to load_data(); machine-specific, change it for your environment.
DATA_PATH = 'C:/Users/User/Projection_PB/Projection_pb/ETB_Parquet_YYYYMM'  # Change this path
MAX_MOB = 24 # Forecast horizon: passed as max_mob to the forecast step
TARGET_MOBS = [24]  # MOB(s) at which to allocate
# ==============================

# Load the raw data, normalise the disbursal date column through the project's
# date parser, then let create_segment_columns() add/derive the segment columns.
df_raw = load_data(DATA_PATH)
df_raw['DISBURSAL_DATE'] = parse_date_column(df_raw['DISBURSAL_DATE'])
# Optional product filter, currently disabled:
#df_raw = df_raw[df_raw["PRODUCT_TYPE"].isin(["C","S"])]
df_raw = create_segment_columns(df_raw)

# Quick size summary: row count, distinct loans (column named by CFG['loan']),
# the configured segment columns, the product codes and the number of distinct
# risk scores present.
print(f"üìä Data: {len(df_raw):,} rows | {df_raw[CFG['loan']].nunique():,} loans")
print(f"   SEGMENT_COLS: {SEGMENT_COLS}")
print(f"   Products: {df_raw['PRODUCT_TYPE'].unique().tolist()}")
print(f"   Risk scores: {df_raw['RISK_SCORE'].nunique()} unique")

In [None]:
# Bare expression: lets the notebook render the loaded frame for a visual check.
df_raw

## 2️⃣ BUILD TRANSITION MATRICES

In [None]:
# compute_transition_by_mob() returns a pair: the matrices (a mapping whose
# values are sized containers, one per top-level key) and a fallback object
# that the calibration and forecast steps receive as `parent_fallback`.
print("üî® Building transition matrices...")
matrices_by_mob, parent_fallback = compute_transition_by_mob(df_raw)
print(f"‚úÖ {len(matrices_by_mob)} products | {sum(map(len, matrices_by_mob.values()))} matrices")

## 3️⃣ BUILD LIFECYCLE + CALIBRATION

In [None]:
# ============================
# 3) BUILD LIFECYCLE + CALIBRATION
# ============================

print("üî® Calibrating k and alpha...")

# Actual results
# Used later as a mapping keyed by (product, score, vintage); each value is
# itself a mapping (presumably keyed by MOB -- confirm against the lifecycle module).
actual_results = get_actual_all_vintages_amount(df_raw)

# DISB_TOTAL map
# Step 1: one disbursed amount per loan (first value seen), indexed by
#         product, risk score, origination date and loan id.
# Step 2: sum over loans -> {(product, score, orig_date): total disbursed}.
loan_disb = df_raw.groupby(["PRODUCT_TYPE", "RISK_SCORE", CFG["orig_date"], CFG["loan"]])[CFG["disb"]].first()
disb_total_by_vintage = loan_disb.groupby(level=[0, 1, 2]).sum().to_dict()

# Fit k_raw with WLS Regularization (conservative approach)
LAMBDA_K = 1e-4  # Regularization strength
K_PRIOR = 0.0    # Prior value (bias toward 0 for conservative forecast)

# fit_k_raw returns three values here (return_detail=True); the third is discarded.
k_raw_by_mob, weight_by_mob, _ = fit_k_raw(
    actual_results=actual_results,
    matrices_by_mob=matrices_by_mob,
    parent_fallback=parent_fallback,
    states=BUCKETS_CANON,
    s30_states=BUCKETS_30P,
    include_co=True,
    denom_mode="disb",
    disb_total_by_vintage=disb_total_by_vintage,
    weight_mode="equal",       # Equal weight for all vintages
    method="wls_reg",          # Regularized WLS for stability
    lambda_k=LAMBDA_K,         # Regularization parameter
    k_prior=K_PRIOR,           # Prior value
    min_obs=5,
    fallback_k=1.0,
    fallback_weight=0.0,
    return_detail=True,
)

print(f"   K values: {len(k_raw_by_mob)} MOBs")

# Smooth k
# The smoothing range is the span of MOBs that received a raw k; both bounds
# default to 0 when k_raw_by_mob is empty.
mob_min = min(k_raw_by_mob.keys()) if k_raw_by_mob else 0
mob_max = max(k_raw_by_mob.keys()) if k_raw_by_mob else 0
k_smooth_by_mob, _, _ = smooth_k(k_raw_by_mob, weight_by_mob, mob_min, mob_max)

# Fit alpha
# mob_target is capped at the highest fitted MOB; when mob_max is 0 (falsy)
# the cap is skipped and MAX_MOB is used directly.
alpha, k_final_by_mob, _ = fit_alpha(
    actual_results=actual_results,
    matrices_by_mob=matrices_by_mob,
    parent_fallback=parent_fallback,
    states=BUCKETS_CANON,
    s30_states=BUCKETS_30P,
    k_smooth_by_mob=k_smooth_by_mob,
    mob_target=min(MAX_MOB, mob_max) if mob_max else MAX_MOB,
    include_co=True,
)

print(f"   Alpha: {alpha:.4f}")
print(f"   K_final: {len(k_final_by_mob)} MOBs")


In [None]:
# Forecast with k_final: project every vintage up to MAX_MOB using the
# calibrated per-MOB k values.
forecast_calibrated = forecast_all_vintages_partial_step(
    actual_results=actual_results,
    matrices_by_mob=matrices_by_mob,
    parent_fallback=parent_fallback,
    max_mob=MAX_MOB,
    k_by_mob=k_final_by_mob,
    states=BUCKETS_CANON,
)

# Merge actuals with the forecast, flatten to a long frame, then tag the
# forecast rows and add the delinquency metrics (both steps also take df_raw).
lifecycle_combined = combine_all_lifecycle_amount(actual_results, forecast_calibrated)
df_lifecycle_final = add_del_metrics(
    tag_forecast_rows_amount(
        lifecycle_to_long_df_amount(lifecycle_combined),
        df_raw,
    ),
    df_raw,
)

print(f"‚úÖ Lifecycle: {len(df_lifecycle_final):,} rows | Forecast: {df_lifecycle_final['IS_FORECAST'].eq(1).sum():,}")

In [None]:
# ============================
# 4) AGGREGATE TO PRODUCT & PORTFOLIO
# ============================

# Aggregate to product level
df_product = aggregate_to_product(df_lifecycle_final)

# Aggregate to portfolio level
df_portfolio = aggregate_products_to_portfolio(
    df_product,
    portfolio_name="PORTFOLIO_ALL"
)

# Combine product + portfolio into one frame with a fresh 0..n-1 index
df_del_all = pd.concat([df_product, df_portfolio], ignore_index=True)

print("\n‚úÖ Aggregation complete:")
print(f"   Product-level: {len(df_product):,} rows")
print(f"   Portfolio-level: {len(df_portfolio):,} rows")
print(f"   Combined: {len(df_del_all):,} rows")

# Create actual_info for all products: last MOB with actual data per
# (product, vintage).
# actual_results is keyed by (product, score, vintage), so several risk scores
# collapse onto the same (product, vintage) key. Keep the MAXIMUM over scores;
# a plain assignment would let whichever score is iterated last win, making the
# result depend on dict order and possibly understating the actual range.
actual_info_prod = {}
for (product, score, vintage), data in actual_results.items():
    # Last observed MOB of this single cohort (not the MAX_MOB forecast horizon).
    max_mob = max(data.keys())
    actual_info_prod[(product, vintage)] = max(
        max_mob,
        actual_info_prod.get((product, vintage), max_mob),
    )

# Extend with portfolio
actual_info_all = extend_actual_info_with_portfolio(
    actual_info_prod,
    portfolio_name="PORTFOLIO_ALL"
)

print(f"\n‚úÖ Actual info: {len(actual_info_all):,} cohorts")


## 5️⃣ ALLOCATE TO LOAN-LEVEL (FAST)

In [None]:
print(f"üî® Allocating to loan-level (MOB {TARGET_MOBS})...")
print("   üìå S·ª≠ d·ª•ng allocation T·ªêI ∆ØU: actual t·ª´ df_raw, forecast khi c·∫ßn")

# Take the latest snapshot: keep only rows whose CUTOFF_DATE equals the most
# recent one, as an independent copy, and derive VINTAGE_DATE from the
# origination-date column named in CFG.
# NOTE(review): this cell only prepares df_loans_latest; no call to the
# imported allocate_multi_mob_optimized follows here -- confirm whether the
# allocation step was removed on purpose.
latest_cutoff = df_raw['CUTOFF_DATE'].max()
df_loans_latest = df_raw.loc[df_raw['CUTOFF_DATE'].eq(latest_cutoff)].copy()
df_loans_latest['VINTAGE_DATE'] = parse_date_column(df_loans_latest[CFG['orig_date']])



## 6️⃣ EXPORT REPORTS

In [None]:
print("üíæ Exporting...")

# Reports go to ./outputs (relative to the working directory); the folder is
# created if it does not exist yet.
output_dir = Path("outputs")
output_dir.mkdir(exist_ok=True)

# Second-resolution timestamp that is embedded in the output file name.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# ============================
# 1. Lifecycle with config info
# ============================

# Prepare the config parameters handed to the export: the notebook-level
# settings first, then values read from CFG with a default for absent keys.
config_params = {
    "DATA_PATH": DATA_PATH,
    "MAX_MOB": MAX_MOB,
    "TARGET_MOBS": TARGET_MOBS,
    "SEGMENT_COLS": SEGMENT_COLS,
    **{
        cfg_key: CFG.get(cfg_key, cfg_default)
        for cfg_key, cfg_default in (
            ("MIN_OBS", 100),
            ("MIN_EAD", 100),
            ("WEIGHT_METHOD", "exp"),
            ("ROLL_WINDOW", 20),
            ("DECAY_LAMBDA", 0.97),
        )
    },
}

lifecycle_file = output_dir / f"Lifecycle_All_Products_{timestamp}.xlsx"
# The export receives the target path as a plain string.
export_lifecycle_with_config_info(
    df_del_all,
    actual_info_all,
    df_raw,
    config_params,
    str(lifecycle_file),
)
print(f"   ‚úÖ {lifecycle_file}")

# ============================



## 📊 QUICK SUMMARY

## 📊 EXPORT CHI TIẾT FORECAST CHO SẾP

Export tất cả thông số để tính forecast cho specific cohorts:
- Transition matrices
- K values (raw, smooth, alpha)
- Actual data by MOB
- Forecast calculation steps

**Output**: File Excel với 6 sheets chi tiết

In [None]:

# ============================================================
# EXPORT ALL COHORTS FOR MONTHS 2025-10 AND 2025-01 (V4)
# ============================================================

# Force reload module to get latest changes
# (export_cohort_details_v4 is imported as a top-level module, so it has to be
# importable from sys.path; reloading picks up edits made since the kernel
# first imported it, and the function is re-bound from the reloaded module.)
import importlib
import export_cohort_details_v4
importlib.reload(export_cohort_details_v4)
from export_cohort_details_v4 import export_cohort_forecast_details_v4

# Repeated imports -- presumably so this cell can be run without the setup cell.
import pandas as pd
from src.config import parse_date_column

# Banner
print("="*60)
print("üìä EXPORT COHORTS V4: 2025-10 v√† 2025-01")
print("   Layout: 1 sheet, m·ªói cohort c√°ch 2 d√≤ng")
print("="*60)

# ============================
# DEBUG: Check matrices_by_mob structure
# ============================
# Prints a shallow sample of the nested mapping: the top-level keys, then for
# the first two of them the first five sorted second-level keys, the
# third-level keys under the first second-level key, and a short description of
# the first entry found there.
print("\nüîç DEBUG: matrices_by_mob structure")
if not matrices_by_mob:
    print("   ‚ö†Ô∏è matrices_by_mob is empty!")
else:
    products = list(matrices_by_mob.keys())
    print(f"   Products: {products}")
    for prod in products[:2]:
        mobs = list(matrices_by_mob[prod].keys())
        print(f"   Product '{prod}': MOBs = {sorted(mobs)[:5]}...")
        if not mobs:
            continue
        # "First" follows the mapping's own iteration order, not the sorted order.
        first_mob = mobs[0]
        scores = list(matrices_by_mob[prod][first_mob].keys())
        print(f"      MOB {first_mob}: Scores = {scores}")
        if not scores:
            continue
        first_score = scores[0]
        entry = matrices_by_mob[prod][first_mob][first_score]
        print(f"      Entry type: {type(entry)}")
        # Only dict entries are inspected further; 'P' is reported by its shape.
        if isinstance(entry, dict):
            print(f"      Entry keys: {list(entry.keys())}")
            if 'P' in entry:
                print(f"      P shape: {entry['P'].shape}")
print("="*60)

# ============================
# 0. CREATE VINTAGE_DATE IF IT IS MISSING
# ============================

if 'VINTAGE_DATE' in df_raw.columns:
    # Column already present: only coerce it to datetime.
    df_raw['VINTAGE_DATE'] = pd.to_datetime(df_raw['VINTAGE_DATE'])
else:
    # Column absent: derive it from DISBURSAL_DATE via the project's date parser.
    print("‚ö†Ô∏è  Creating VINTAGE_DATE from DISBURSAL_DATE...")
    df_raw['VINTAGE_DATE'] = parse_date_column(df_raw['DISBURSAL_DATE'])
    print("‚úÖ VINTAGE_DATE created")

# ============================
# 1. FIND ALL COHORTS
# ============================
# For every target month, collect one (product, score, month) tuple per
# PRODUCT_TYPE x RISK_SCORE group that has rows in that month.

target_months = ['2025-10-01', '2025-01-01']
all_cohorts = []

for month in target_months:
    month_dt = pd.to_datetime(month)
    # NOTE(review): exact equality against the first day of the month -- this
    # only matches if VINTAGE_DATE is normalised to month start; confirm.
    df_month = df_raw.loc[df_raw['VINTAGE_DATE'].eq(month_dt)]

    if not len(df_month):
        print(f"‚ö†Ô∏è  No data for {month}")
        continue

    # Distinct loan count per (product, risk score) group.
    cohorts = df_month.groupby(['PRODUCT_TYPE', 'RISK_SCORE'])['AGREEMENT_ID'].nunique()

    print(f"\n{month}:")
    print(f"  Cohorts: {len(cohorts)}")
    print(f"  Loans: {cohorts.sum():,}")

    # The tuple carries the month as the original string, not as month_dt.
    for (product, score), n_loans in cohorts.items():
        all_cohorts.append((product, score, month))

print("\n" + "=" * 60)
print(f"‚úÖ Total cohorts: {len(all_cohorts)}")
print("=" * 60)

# ============================
# 2. EXPORT (V4 - SINGLE SHEET)
# ============================
# Runs only when at least one cohort was collected; otherwise prints a
# "not found" message.

if len(all_cohorts) > 0:
    print(f"\nüì§ Exporting {len(all_cohorts)} cohorts (v4 - single sheet)...")
    
    # Create alpha_by_mob if it doesn't exist
    # The exporter takes a per-MOB alpha mapping. If no such global exists, the
    # single fitted `alpha` (or 0.5 when `alpha` is missing too) is repeated
    # for every MOB that has a raw k.
    if 'alpha_by_mob' not in globals():
        if 'alpha' in globals():
            alpha_by_mob = {mob: alpha for mob in k_raw_by_mob.keys()}
            print(f"   ‚ÑπÔ∏è  Created alpha_by_mob from single alpha: {alpha:.4f}")
        else:
            alpha_by_mob = {mob: 0.5 for mob in k_raw_by_mob.keys()}
            print(f"   ‚ö†Ô∏è  Alpha not found, using default: 0.5")
    
    # target_mob: first element when TARGET_MOBS is a list, else the value itself.
    # The returned value is printed below as the output file.
    filename = export_cohort_forecast_details_v4(
        cohorts=all_cohorts,
        df_raw=df_raw,
        matrices_by_mob=matrices_by_mob,
        k_raw_by_mob=k_raw_by_mob,
        k_smooth_by_mob=k_smooth_by_mob,
        alpha_by_mob=alpha_by_mob,
        target_mob=TARGET_MOBS[0] if isinstance(TARGET_MOBS, list) else TARGET_MOBS,
        output_dir='cohort_details',
    )
    
    # Completion summary
    print(f"\n{'='*60}")
    print(f"‚úÖ HO√ÄN TH√ÄNH!")
    print(f"{'='*60}")
    print(f"üìÑ File: {filename}")
    print(f"üìä Cohorts: {len(all_cohorts)}")
    print(f"\nüí° Layout V4:")
    print(f"   - 1 sheet duy nh·∫•t (All_Cohorts)")
    print(f"   - M·ªói cohort c√°ch nhau 2 d√≤ng")
    print(f"   - C√≥ ƒë·∫ßy ƒë·ªß: Current + K + Transition Matrix")
    print(f"\nüéØ S·∫µn s√†ng g·ª≠i cho s·∫øp!")
    print(f"{'='*60}")
else:
    print(f"\n‚ùå Kh√¥ng t√¨m th·∫•y cohorts")

