# OpenFinGuard — Label Generation (DPD)

Week: 4  
Day: 2  
Objective:
- Compute Days Past Due (DPD)
- Create late_30 / late_60 / late_90 outcomes
- Preserve temporal integrity


Load required tables

In [1]:
import pandas as pd
from datetime import timedelta

cycles = pd.read_csv(
    "../data/raw/billing_cycles.csv",
    parse_dates=["cycle_start_date", "cycle_end_date", "due_date"]
)

payments = pd.read_csv(
    "../data/raw/payments.csv",
    parse_dates=["payment_date"]
)
cycles.shape, payments.shape



((180, 8), (180, 5))

# Compute DPD
DPD definition:
DPD = max(0, payment_date − due_date)

In [2]:
labels = cycles.merge(
    payments[["billing_cycle_id", "payment_date"]],
    on="billing_cycle_id",
    how="left"
)

labels["dpd"] = (labels["payment_date"] - labels["due_date"]).dt.days
labels["dpd"] = labels["dpd"].clip(lower=0)
labels["dpd"].describe()


count    180.000000
mean      13.477778
std       26.108167
min        0.000000
25%        0.000000
50%        0.000000
75%       20.000000
max      119.000000
Name: dpd, dtype: float64

DPD definition:
DPD = max(0, payment_date − due_date)

In [3]:
labels = cycles.merge(
    payments[["billing_cycle_id", "payment_date"]],
    on="billing_cycle_id",
    how="left"
)

labels["dpd"] = (labels["payment_date"] - labels["due_date"]).dt.days
labels["dpd"] = labels["dpd"].clip(lower=0)
labels["dpd"].describe()


count    180.000000
mean      13.477778
std       26.108167
min        0.000000
25%        0.000000
50%        0.000000
75%       20.000000
max      119.000000
Name: dpd, dtype: float64

Create multi-horizon labels

In [4]:
labels["late_30"] = (labels["dpd"] >= 30).astype(int)
labels["late_60"] = (labels["dpd"] >= 60).astype(int)
labels["late_90"] = (labels["dpd"] >= 90).astype(int)
labels[["late_30", "late_60", "late_90"]].mean()


late_30    0.183333
late_60    0.072222
late_90    0.038889
dtype: float64

Validate temporal integrity

In [5]:
(labels["payment_date"] >= labels["due_date"]).all()


np.False_

Save labels for modeling

In [6]:
labels_out = labels[[
    "billing_cycle_id", "dpd", "late_30", "late_60", "late_90"
]]

labels_out.to_csv("../data/processed/labels.csv", index=False)


In [7]:
features = ts_data[[
    "billing_cycle_id",
    "credit_utilization",
    "min_due_ratio",
    "monthly_spend",
    "spend_volatility",
    "util_roll_3",
    "spend_roll_3",
    "spend_shock"
]].copy()

features.to_csv("../data/processed/features.csv", index=False)


NameError: name 'ts_data' is not defined