# OpenFinGuard — Class Imbalance Handling

Week: 5  
Day: 2  
Objective:
- Handle rare-event prediction responsibly
- Preserve time-based realism
- Compare imbalance strategies


In [5]:
# Import pandas here so this cell runs on a fresh kernel: it is the first
# cell to use `pd`, and the notebook's other import cell only comes later.
import pandas as pd

# Load the processed modeling table, parsing the billing-cycle end date as a
# datetime column.
modeling = pd.read_csv(
    "../data/processed/modeling_table.csv",
    parse_dates=["cycle_end_date"]
)


In [6]:
# Display the column names of the loaded modeling table.
modeling.columns


Index(['billing_cycle_id', 'credit_utilization', 'min_due_ratio',
       'monthly_spend', 'spend_volatility', 'util_roll_3', 'spend_roll_3',
       'spend_shock', 'dpd', 'late_30', 'late_60', 'late_90', 'cycle_end_date',
       'payment_stress', 'spend_to_limit'],
      dtype='str')

In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Reload the modeling table so this cell does not depend on earlier cells;
# the cycle end date is parsed as a datetime because the split is by date.
modeling = pd.read_csv(
    "../data/processed/modeling_table.csv",
    parse_dates=["cycle_end_date"],
)

# Time-based split: rows whose cycle end date is on or before the 75th
# percentile date go to training, strictly later rows go to validation.
cycle_end = modeling["cycle_end_date"]
cutoff_date = cycle_end.quantile(0.75)

train = modeling.loc[cycle_end <= cutoff_date]
valid = modeling.loc[cycle_end > cutoff_date]

# Model inputs and target column, named once and reused for both splits.
feature_cols = [
    "credit_utilization",
    "min_due_ratio",
    "monthly_spend",
    "spend_volatility",
    "util_roll_3",
    "spend_roll_3",
    "spend_shock",
    "payment_stress",
    "spend_to_limit",
]
target_col = "late_30"

X_train, y_train = train[feature_cols], train[target_col]
X_valid, y_valid = valid[feature_cols], valid[target_col]


Baseline without imbalance handling

In [8]:
# Reference model: logistic regression with no imbalance correction.
# `fit` returns the estimator itself, so construction and fitting are chained.
logit_base = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the validation window with the second predict_proba column, then
# summarise ranking quality as ROC AUC.
base_scores = logit_base.predict_proba(X_valid)[:, 1]
base_auc = roc_auc_score(y_valid, base_scores)

base_auc


0.5486111111111112

Strategy 1: Class weights

In [9]:
# Same model as the baseline, but with class_weight="balanced" so the
# estimator reweights the classes during fitting.
weighted_params = {"max_iter": 1000, "class_weight": "balanced"}
logit_weighted = LogisticRegression(**weighted_params)
logit_weighted.fit(X_train, y_train)

# Evaluate on the same validation window and metric as the baseline so the
# two AUC values are directly comparable.
weighted_scores = logit_weighted.predict_proba(X_valid)[:, 1]
weighted_auc = roc_auc_score(y_valid, weighted_scores)

weighted_auc


0.5347222222222222

Strategy 2: Threshold tuning (BUSINESS tool)

In [10]:
# Baseline-model scores for the validation window: column index 1 of
# predict_proba (presumably P(late_30 == 1), assuming a 0/1 target — confirm).
probs_valid = logit_base.predict_proba(X_valid)[:, 1]


In [11]:
# Candidate decision cutoffs applied to the baseline validation scores.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]

for t in thresholds:
    # Flag every validation row whose score reaches the cutoff, then report
    # the share of rows flagged (mean of the 0/1 predictions).
    flagged = probs_valid >= t
    preds = flagged.astype(int)
    print(f"Threshold {t}: Positive rate = {preds.mean():.2f}")


Threshold 0.1: Positive rate = 1.00
Threshold 0.2: Positive rate = 0.13
Threshold 0.3: Positive rate = 0.00
Threshold 0.4: Positive rate = 0.00
Threshold 0.5: Positive rate = 0.00


Precision vs Recall intuition
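Lower threshold → higher recall (more risky accounts caught), at the cost of more false alarms

Higher threshold → usually higher precision (fewer false alarms), at the cost of missing more risky accounts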