# Risk Markov Projection — Interactive Notebook
Configure, run, and inspect the projection pipeline without changing the library code.

**Usage tips**
- Edit the config overrides in the cells below instead of hard-coding values in the library modules.
- Choose `parquet` or `oracle` as the data source.
- Optional: generate a synthetic parquet sample if you do not have input data handy.
- Outputs (CSV + Parquet) are written to the directory set in `config.OUTPUT["dir"]` (see the configuration cell below).

In [None]:
# Environment setup: locate project root that contains config.py and src/
import sys
from pathlib import Path

def find_project_root(start: Path, marker: str = "config.py", max_depth: int = 7) -> Path:
    """Locate the project root by walking upwards from ``start``.

    At each level two candidates are tested, in this order: the directory
    itself and its ``risk_markov_projection`` child. A candidate qualifies
    when it contains both ``marker`` and a ``src`` entry.

    Args:
        start: Directory to begin the search from. Relative paths are
            resolved first; otherwise ``Path(".").parent`` is ``Path(".")``
            and the walk would never leave the starting directory.
        marker: Name of the entry that identifies the project root.
        max_depth: Maximum number of directory levels to inspect, counting
            ``start`` itself as the first level.

    Returns:
        The first qualifying candidate directory, as an absolute path.

    Raises:
        FileNotFoundError: If no candidate within ``max_depth`` levels
            contains both ``marker`` and ``src``.
    """
    current = Path(start).resolve()
    for _ in range(max_depth):
        for candidate in (current, current / "risk_markov_projection"):
            if (candidate / marker).exists() and (candidate / "src").exists():
                return candidate
        parent = current.parent
        if parent == current:
            # Reached the filesystem root; further iterations would only
            # re-test the same two candidates.
            break
        current = parent
    raise FileNotFoundError(f"Could not find project root from {start}")

# Resolve the notebook's working directory and discover the project root from it.
CWD = Path.cwd().resolve()
ROOT = find_project_root(CWD)
SRC = ROOT / "src"

# Prepend whichever of SRC / ROOT is not on sys.path yet. SRC is listed first so
# that, when both are added, SRC ends up ahead of ROOT in the search order.
sys.path[:0] = [entry for entry in (str(SRC), str(ROOT)) if entry not in sys.path]

# Report the detected locations for a quick sanity check.
print("Detected project root:", ROOT)
print("Src path added:", SRC)
print("Working directory:", CWD)

In [None]:
# Imports (force-load project config to avoid name conflicts)
import importlib
import importlib.util
from importlib.machinery import SourceFileLoader
from pathlib import Path
import pandas as pd
import numpy as np

# Explicitly load config.py from the project root and register it in sys.modules
# under the name "config", so this exact file is the one bound to that name.
config_path = ROOT / "config.py"
spec = importlib.util.spec_from_file_location("config", config_path)
# Validate the spec before it is used. spec_from_file_location may return None,
# and a spec may carry no loader. An explicit raise is used instead of `assert`
# because assertions are stripped when Python runs with -O.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot load project config from {config_path}")
config = importlib.util.module_from_spec(spec)
# Register before executing, following the importlib recipe for importing a
# source file directly.
sys.modules["config"] = config
spec.loader.exec_module(config)

from src.data.data_loader import load_raw_data
from src.data.schema import default_schema
from src.data.validators import validate_input
from src.pipelines.run_projection import run
from src.utils.logger import get_logger

# Logger for messages emitted from this notebook, obtained from the project's
# get_logger helper ("notebook" is presumably the logger name — confirm in src.utils.logger).
logger = get_logger("notebook")

## Configure runtime parameters
Adjust the variables below to control data source, thresholds, and output paths at runtime.

In [None]:
# --- Data source and paths ---------------------------------------------------
DATA_SOURCE = "parquet"  # options: "parquet" or "oracle"
PARQUET_PATH = Path(r"C:/Users/MAFC4709/Python_work/Projection/data/parquet/POS")
GENERATE_SYNTHETIC = False  # set True to create a sample parquet if you have no data

# Mirror the data source choice onto the loaded config module.
config.DATA_SOURCE = DATA_SOURCE
config.PARQUET_PATH = PARQUET_PATH

# --- Oracle SQL settings (used only when DATA_SOURCE == "oracle") -------------
# Values already present in ORACLE_CONFIG are kept; only missing keys get a fallback.
config.ORACLE_CONFIG.setdefault("sql", "SELECT * FROM RISK_MARKOV_VIEW")
config.ORACLE_CONFIG.setdefault("params", {})
config.ORACLE_CONFIG.setdefault("sql_dir", "sql")

# --- Thresholds and other runtime knobs --------------------------------------
config.CALIBRATION["enabled"] = True
config.MAX_MOB = 24
config.MIN_EAD = 100.0
config.MIN_OBS = 50

# --- Output overrides (keeps notebook outputs separate) ------------------------
config.OUTPUT.update(
    dir=ROOT / "outputs" / "notebook",
    csv_name="projection_notebook.csv",
    parquet_name="projection_notebook.parquet",
)

# Echo the active data source as the cell result.
config.DATA_SOURCE

## Optional: generate a synthetic parquet sample
Use this if you want a quick run without connecting to Oracle or preparing data.

In [None]:
def make_synthetic_dataset(num_loans_per_segment: int = 10, max_mob: int = 6) -> pd.DataFrame:
    """Build a deterministic toy loan panel for quick pipeline runs.

    Four segments are generated (risk scores "A"/"B" crossed with products
    "P1"/"P2"). Every loan gets one row per month-on-book from 0 through
    ``max_mob`` and steps through a fixed state sequence; every loan whose
    index within its segment satisfies ``index % 3 == 1`` is marked
    "CLOSED" on its final month.

    Args:
        num_loans_per_segment: Number of loans created in each segment.
        max_mob: Highest month-on-book generated (inclusive).

    Returns:
        A DataFrame with columns AGREEMENT_ID, MOB, STATE_MODEL,
        PRINCIPLE_OUTSTANDING, CUTOFF_DATE, RISK_SCORE and PRODUCT_TYPE.
    """
    first_cutoff = pd.Timestamp("2024-01-31")
    lifecycle = ("CURRENT", "CURRENT", "DPD30+", "DPD60+", "DPD90+", "WRITEOFF", "WRITEOFF")
    final_stage = len(lifecycle) - 1

    def loan_rows(score, product_type, loan_no):
        """Yield one record per month-on-book for a single loan."""
        loan_key = f"{score}{product_type}{loan_no}"
        opening_balance = 1000 + 50 * loan_no
        closes_at_end = loan_no % 3 == 1
        for month in range(max_mob + 1):
            if closes_at_end and month == max_mob:
                status = "CLOSED"
            else:
                # Months beyond the end of the sequence stay in its last state.
                status = lifecycle[min(month, final_stage)]
            # Balance falls by 25 per month and is floored at 50.
            balance = max(opening_balance - month * 25, 50)
            yield {
                "AGREEMENT_ID": loan_key,
                "MOB": month,
                "STATE_MODEL": status,
                "PRINCIPLE_OUTSTANDING": float(balance),
                "CUTOFF_DATE": first_cutoff + pd.DateOffset(months=month),
                "RISK_SCORE": score,
                "PRODUCT_TYPE": product_type,
            }

    rows = [
        row
        for score, product_type in (("A", "P1"), ("A", "P2"), ("B", "P1"), ("B", "P2"))
        for loan_no in range(num_loans_per_segment)
        for row in loan_rows(score, product_type, loan_no)
    ]
    return pd.DataFrame(rows)


# A sample file is written only when the parquet source is selected AND
# synthetic generation was requested; otherwise existing files are used as-is.
if not (DATA_SOURCE == "parquet" and GENERATE_SYNTHETIC):
    print("Synthetic generation skipped; using existing parquet files at", PARQUET_PATH)
else:
    sample_df = make_synthetic_dataset(num_loans_per_segment=8, max_mob=8)
    sample_file = PARQUET_PATH / "sample.parquet"
    # Create the target directory (and any missing parents) before writing.
    PARQUET_PATH.mkdir(parents=True, exist_ok=True)
    sample_df.to_parquet(sample_file, index=False)
    print("Synthetic parquet created at", sample_file)
    print(sample_df.head())

## Load and validate data

In [None]:
# Build the project's default schema object; the same instance is passed to both
# the loader and the validator below.
schema = default_schema()
# Load the raw input from the selected source. PARQUET_PATH is passed regardless of
# DATA_SOURCE (presumably ignored for "oracle" — confirm in src.data.data_loader).
raw_df = load_raw_data(schema=schema, source=DATA_SOURCE, parquet_path=PARQUET_PATH)
# Validate the loaded frame against the schema using the configured MAX_MOB. The return
# value is discarded, so failures are presumably signalled by an exception — TODO confirm.
validate_input(raw_df, schema=schema, max_mob=config.MAX_MOB)
# Preview the first rows as the cell output.
raw_df.head()

## Run projection pipeline
Use the library runner to build transitions, project EAD, apply calibration, and write outputs.

In [None]:
# Run the library pipeline entry point with the notebook's settings.
# NOTE: asof_date is a fixed literal here; target_mob follows config.MAX_MOB set in the
# configuration cell. The data is re-read by run() from the same source/path rather than
# reusing raw_df (run() receives no DataFrame argument).
projection_df = run(
    asof_date="2024-12-31",
    target_mob=config.MAX_MOB,
    source=DATA_SOURCE,
    parquet_path=PARQUET_PATH,
)
# Report the result size, then preview the first rows as the cell output.
print("Projection shape:", projection_df.shape)
projection_df.head()

## Inspect outputs
- EAD per state, distribution over EAD0, delinquency indicators
- Audit columns: matrix_source, mob_used, n_obs_used, ead_sum_used
- Calibration factor (if enabled)

In [None]:
# Column groups to inspect, discovered by prefix from the projection output.
# "EAD0" contains no underscore, so the "EAD_" prefix test already excludes it.
state_cols = [col for col in projection_df.columns if col.startswith("EAD_")]
indicator_cols = [col for col in projection_df.columns if col.startswith("DEL_")]

# Audit columns are listed by name; keep only those actually present so that an
# optional column (e.g. calibration_factor when calibration is disabled) does not
# make the selection below raise KeyError.
expected_audit_cols = ["matrix_source", "mob_used", "n_obs_used", "ead_sum_used", "calibration_factor"]
audit_cols = [col for col in expected_audit_cols if col in projection_df.columns]
missing_audit_cols = [col for col in expected_audit_cols if col not in projection_df.columns]
if missing_audit_cols:
    print("Audit columns not present in projection output:", missing_audit_cols)

display(projection_df[state_cols + indicator_cols + audit_cols].head())

# Fallback coverage: share of rows whose matrix_source is not "segment_mob".
if "matrix_source" in projection_df.columns:
    fallback_rate = (projection_df["matrix_source"] != "segment_mob").mean()
    print(f"Fallback usage: {fallback_rate*100:.2f}%")
else:
    print("Fallback usage: n/a (matrix_source column not present)")

## Visualize delinquency over MOB

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Use the first distinct (RISK_SCORE, PRODUCT_TYPE) pair in the projection as the
# example segment and keep only its rows.
segment = projection_df[["RISK_SCORE", "PRODUCT_TYPE"]].drop_duplicates().iloc[0]
mask = projection_df["RISK_SCORE"].eq(segment["RISK_SCORE"]) & projection_df["PRODUCT_TYPE"].eq(
    segment["PRODUCT_TYPE"]
)
subset = projection_df[mask]

# One line per delinquency ratio, each labelled with its column name.
plt.figure(figsize=(8, 4))
for ratio_col in ("DEL_30P_ON_EAD0", "DEL_60P_ON_EAD0", "DEL_90P_ON_EAD0"):
    plt.plot(subset["MOB"], subset[ratio_col], label=ratio_col)
plt.title(f"Delinquency trajectory for segment {segment['RISK_SCORE']} / {segment['PRODUCT_TYPE']}")
plt.xlabel("MOB")
plt.ylabel("Ratio over EAD0")
plt.grid(True)
plt.legend()
plt.show()