In [33]:
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from IPython.display import display
# from xgboost import XGBClassifier

<span style="font-size:30px; font-family:'Times new roman'; font-weight:bold">
Import Data🎒
</span>

<span style="font-size:15px; font-family:'Times new roman'">
Why logistic regression when we build a PD model under an IFRS 9 framework? --> Interpretability and regulatory acceptance<br>
- Logistic regression is the industry standard for PD because its coefficients map directly to scorecard <br>points or log-odds, making it easy to explain to auditors and regulators<br>
- It lets you incorporate Weight-of-Evidence (WOE) transformations and track monotonic relationships
</span>

In [34]:
# Load the modelling table from CSV; the path is relative to the working directory.
srcPath = "../../data/toModel.csv"
df = pd.read_csv(srcPath)

In [35]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,...,INSTM_PAYMENT_DIFF_MEAN_woe,INSTM_PAYMENT_DIFF_MIN_woe,INSTM_PAYMENT_DIFF_MAX_woe,INSTM_PAYMENT_RATIO_STD_woe,INSTM_IS_LATE_SUM_woe,INSTM_IS_EARLY_SUM_woe,INSTM_IS_OVERPAY_SUM_woe,INSTM_IS_MISSED_SUM_woe,SK_ID_CURR,TARGET
0,0.036202,0.250941,0.014724,0.188771,0.111466,0.213833,-0.037978,0.298054,0.011871,0.154966,...,0.050639,0.036775,-0.044722,0.12792,0.189795,-0.063673,-0.046019,-0.01631,100002,1
1,0.036202,-0.154315,-0.080369,-0.362672,-0.439432,-0.071234,-0.037978,-0.26593,-0.043154,-0.332558,...,0.050639,0.036775,-0.044722,0.12792,0.189795,0.035346,-0.046019,-0.01631,100003,0
2,-0.41503,0.250941,0.014724,0.188771,0.111466,0.213833,-0.037978,0.298054,-0.043154,-0.156375,...,0.050639,0.036775,-0.044722,0.12792,0.189795,-0.173362,-0.046019,-0.01631,100004,0
3,0.036202,-0.154315,0.014724,0.188771,0.111466,0.229315,-0.037978,0.298054,0.011871,0.154966,...,0.050639,0.036775,-0.044722,0.12792,0.189795,-0.063673,-0.046019,-0.01631,100006,0
4,0.036202,0.250941,0.014724,0.188771,0.111466,0.213833,-0.037978,-0.26593,0.003832,-0.170278,...,-0.180189,-0.159709,-0.044722,-0.115678,-0.203037,0.204516,-0.046019,-0.01631,100007,0


<span style="font-size:30px; font-family:'Times new roman'; font-weight:bold">
Logistic Regression🪵
</span>

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import train_test_split

In [37]:
# Split the table into the feature matrix, the row identifier and the label.
# A 'PD' column is written back into `df` further down in this notebook; dropping it
# here (only when it exists) keeps a re-run of this cell from feeding earlier
# predictions back into the features. On a fresh kernel the result is unchanged.
X = (
    df.drop(columns=['TARGET', 'SK_ID_CURR'])
      .drop(columns=['PD'], errors='ignore')
)
idSeries = df['SK_ID_CURR']
y = df['TARGET']

In [38]:
# Baseline PD model: constant imputation -> standardisation -> logistic regression.
baseLogit = LogisticRegression(
    # No feature selection is done, so L2 is the better fit:
    # it keeps every feature and only shrinks the coefficients.
    penalty='l2',
    solver='liblinear',
    # Re-weight the classes because TARGET is very imbalanced.
    class_weight='balanced',
)
model = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=0),  # missing values become 0
    StandardScaler(),
    baseLogit,
)
# Last expression: fit returns the pipeline, which the notebook renders as the cell output.
model.fit(X, y)

0,1,2
,steps,"[('simpleimputer', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'liblinear'
,max_iter,100


In [39]:
# 8-fold cross-validated ROC AUC of the baseline pipeline.
score = cross_val_score(estimator=model, X=X, y=y, scoring='roc_auc', cv=8)
aucMean, aucStd = score.mean(), score.std()
print(f"8 folds AUC: {aucMean:.4f} ± {aucStd:.4f}")

8 folds AUC: 0.7492 ± 0.0035


In [43]:
# A Pipeline does not expose .coef_ itself, so read the coefficients and the
# intercept from its fitted logistic-regression step.
fittedLogit = model.named_steps['logisticregression']
coef = pd.Series(fittedLogit.coef_[0], index=X.columns)
intercept = fittedLogit.intercept_[0]

In [44]:
# Report the intercept plus the five largest and five smallest coefficients.
coefDescending = coef.sort_values(ascending=False)
coefAscending = coef.sort_values()

print("Intercept:", intercept)
print("5 top highest coeff --> top default driver:")
print(coefDescending.head())
print("most negative coeff --> top protective driver:")
print(coefAscending.head())

Intercept: -0.3725063532022945
5 top highest coeff --> top default driver:
CODE_GENDER            0.164273
FLAG_DOCUMENT_3        0.161665
ORGANIZATION_TYPE      0.143422
NAME_EDUCATION_TYPE    0.138257
FLAG_OWN_CAR           0.113104
dtype: float64
most negative coeff --> top protective driver:
EXT_SOURCE_2_woe                    -0.438022
EXT_SOURCE_3_woe                    -0.377364
BUREAU_DEBT_CREDIT_RATIO_MEAN_woe   -0.192195
INSTM_IS_EARLY_SUM_woe              -0.162840
DAYS_EMPLOYED_woe                   -0.125267
dtype: float64


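<span style="font-size:15px; font-family:'Times new roman'">
Phew 😮‍💨 the intercept is a negative number --> still normal: it is the log-odds for an applicant whose standardized features all sit at their mean, so a negative value simply means a baseline PD below 0.5
</span>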

In [45]:
# Finally, predict PD (our first goal): keep the second column of predict_proba,
# i.e. the probability of the positive class, for every row.
# NOTE(review): the model is fit with class_weight='balanced' and is scored here on the
# same rows it was trained on, so these values are presumably not calibrated to the
# observed default rate -- confirm before using them as PDs.
classProba = model.predict_proba(X)
df['PD'] = classProba[:, 1]

<span style="font-size:15px; font-family:'Times new roman'">
How PD is calculated --> apply the logistic transform to the log-odds (model.decision_function(X)): PD = 1 / (1 + exp(-log-odds))<br>
sklearn's predict_proba performs these steps for us
</span>

In [46]:
# Display the PD column that was just written to the table.
df['PD'] # 🥹

0         0.851061
1         0.157919
2         0.223817
3         0.418342
4         0.540705
            ...   
307506    0.513636
307507    0.685561
307508    0.474071
307509    0.259091
307510    0.533619
Name: PD, Length: 307511, dtype: float64

<span style="font-size:30px; font-family:'Times new roman'; font-weight:bold">
Hyperparameter Tuning🪵
</span>

In [47]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from scipy.stats import uniform

In [48]:
# For time efficiency, use a random search instead of an exhaustive grid.
paramDist = {
    # Inverse regularisation strength, drawn uniformly from [0.01, 100.01].
    'logisticregression__C': uniform(loc=0.01, scale=100),
    # Both penalties are tried with the pipeline's liblinear solver.
    'logisticregression__penalty': ['l1', 'l2']
}
# Seed the shuffle so the folds, and therefore the CV scores, are the same on every run.
cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)

In [49]:
# Draw 20 (C, penalty) candidates from `paramDist` and score each one by ROC AUC
# on the `cv` splitter.
randomSearch = RandomizedSearchCV(
    model,
    param_distributions=paramDist,
    n_iter=20,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,         # run on all available cores
    random_state=42,   # fixed seed -> the same candidates are sampled on every run
)
randomSearch.fit(X, y)

print("Best AUC:", randomSearch.best_score_)
print("Best Params:", randomSearch.best_params_)

Best AUC: 0.7494096137022238
Best Params: {'logisticregression__C': np.float64(99.59683055695973), 'logisticregression__penalty': 'l1'}


In [52]:
# Keep the search winner (refit pipeline) and its mean cross-validated score.
bestModelEstimator, bestModelScore = (
    randomSearch.best_estimator_,
    randomSearch.best_score_,
)

In [53]:
# Read coefficients and intercept from the tuned pipeline's logistic-regression step,
# then repeat the search summary.
tunedLogit = bestModelEstimator.named_steps['logisticregression']
coef = pd.Series(tunedLogit.coef_[0], index=X.columns)
intercept = tunedLogit.intercept_[0]
print("Best AUC:", randomSearch.best_score_)
print("Best Params:", randomSearch.best_params_)

Best AUC: 0.7494096137022238
Best Params: {'logisticregression__C': np.float64(99.59683055695973), 'logisticregression__penalty': 'l1'}


In [54]:
# Report the tuned model's intercept and its five largest / five smallest coefficients.
topPositive = coef.sort_values(ascending=False).head()
topNegative = coef.sort_values().head()

print("Intercept:", intercept)
print("5 top highest coeff --> top default driver:")
print(topPositive)
print("most negative coeff --> top protective driver:")
print(topNegative)

Intercept: -0.3725160978386604
5 top highest coeff --> top default driver:
CODE_GENDER            0.164277
FLAG_DOCUMENT_3        0.161722
ORGANIZATION_TYPE      0.143428
NAME_EDUCATION_TYPE    0.138259
FLAG_OWN_CAR           0.113107
dtype: float64
most negative coeff --> top protective driver:
EXT_SOURCE_2_woe                    -0.438026
EXT_SOURCE_3_woe                    -0.377370
BUREAU_DEBT_CREDIT_RATIO_MEAN_woe   -0.192201
INSTM_IS_EARLY_SUM_woe              -0.162845
DAYS_EMPLOYED_woe                   -0.125267
dtype: float64


In [55]:
# Store the tuned model's positive-class probability as PD, replacing any earlier PD column.
tunedProba = bestModelEstimator.predict_proba(X)
df['PD'] = tunedProba[:, 1]

In [56]:
# Preview the table again, now including the PD column.
df.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,...,INSTM_PAYMENT_DIFF_MIN_woe,INSTM_PAYMENT_DIFF_MAX_woe,INSTM_PAYMENT_RATIO_STD_woe,INSTM_IS_LATE_SUM_woe,INSTM_IS_EARLY_SUM_woe,INSTM_IS_OVERPAY_SUM_woe,INSTM_IS_MISSED_SUM_woe,SK_ID_CURR,TARGET,PD
0,0.036202,0.250941,0.014724,0.188771,0.111466,0.213833,-0.037978,0.298054,0.011871,0.154966,...,0.036775,-0.044722,0.12792,0.189795,-0.063673,-0.046019,-0.01631,100002,1,0.851064
1,0.036202,-0.154315,-0.080369,-0.362672,-0.439432,-0.071234,-0.037978,-0.26593,-0.043154,-0.332558,...,0.036775,-0.044722,0.12792,0.189795,0.035346,-0.046019,-0.01631,100003,0,0.157916
2,-0.41503,0.250941,0.014724,0.188771,0.111466,0.213833,-0.037978,0.298054,-0.043154,-0.156375,...,0.036775,-0.044722,0.12792,0.189795,-0.173362,-0.046019,-0.01631,100004,0,0.223813
3,0.036202,-0.154315,0.014724,0.188771,0.111466,0.229315,-0.037978,0.298054,0.011871,0.154966,...,0.036775,-0.044722,0.12792,0.189795,-0.063673,-0.046019,-0.01631,100006,0,0.418342
4,0.036202,0.250941,0.014724,0.188771,0.111466,0.213833,-0.037978,-0.26593,0.003832,-0.170278,...,-0.159709,-0.044722,-0.115678,-0.203037,0.204516,-0.046019,-0.01631,100007,0,0.540706


<span style="font-size:30px; font-family:'Times new roman'; font-weight:bold">
Transfer Data to the Post-Model Process
</span>

In [57]:
import joblib
# Directory (relative path) that receives the serialized artifacts written below.
destDir = "../artifacts"

In [58]:
# Persist the tuned pipeline, the scored table and the coefficients for the post-model step.
os.makedirs(destDir, exist_ok=True)
artifacts = {
    'bestModel.pkl': bestModelEstimator,
    'data.pkl': df,
    'coef.pkl': coef,
}
for fileName, payload in artifacts.items():
    joblib.dump(payload, os.path.join(destDir, fileName))
# Kept as the last expression so the cell still displays the path of the final dump.
joblib.dump(intercept, os.path.join(destDir, 'intercept.pkl'))

['../artifacts\\intercept.pkl']

<span style="font-size:30px; font-family:'Times new roman'; font-weight:bold">
Logistic Regression without WOE --> kept only as an AUC benchmark; no longer used in my model🪵
</span>

In [None]:
# x = df.drop(columns=["TARGET",'SK_ID_CURR'])
# y = df['TARGET']

In [None]:
# # split train val test
# xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=745, stratify=y)

In [None]:
# # replace ±inf with NaN 
# xTrain = xTrain.replace([np.inf, -np.inf], np.nan)
# xTest  = xTest.replace([np.inf, -np.inf], np.nan)

# # (optional) see which columns were affected
# inf = xTrain.columns[xTrain.isna().any()]
# print("Columns with inf/NaN:", inf.tolist())

Columns with inf/NaN: ['AMT_ANNUITY', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'BUREAU_DAYS_CREDIT_MIN', 'BUREAU_DAYS_CREDIT_MAX', 'BUREAU_DAYS_CREDIT_MEAN', 'BUREAU_DAYS_CREDIT_ENDDATE_MEAN', 'BUREAU_AMT_CREDIT_SUM_SUM', 'BUREAU_AMT_CREDIT_SUM_MEAN', 'BUREAU_AMT_CREDIT_SUM_DEBT_SUM', 'BUREAU_AMT_CREDIT_SUM_DEBT_MEAN', 'BUREAU_AMT_CREDIT_SUM_OVERDUE_SUM', 'BUREAU_AMT_CREDIT_SUM_OVERDUE_MEAN', 'BUREAU_DEBT_CREDIT_RATIO_MEAN', 'BUREAU_IS_ACTIVE_SUM', 'BUREAU_IS_CLOSED_SUM', 'BUREAU_IS_SOLD_SUM', 'BUREAU_IS_BAD_DEBT_SUM', 'BUREAU_IS_REVOLVING_SUM', 'BUREAU_IS_CONSUMER_SUM', 'BUREAU_IS_MORTGAGE_SUM', 'BUREAU_BB_MONTHS_BALANCE_COUNT_SUM', 'BUREAU_BB_IS_LATE_SUM_SUM', 'BUREAU_BB_IS_DPD_30_SUM_SUM', 'BUREAU_BB_I

In [None]:
# # Pipeline: scaling → logistic regression with L2 penalty
# lrPipeline = make_pipeline(
#     SimpleImputer(strategy='constant', fill_value=0),
#     # to unit variance --> same scale
#     StandardScaler(),
#     # classifier
#     LogisticRegression(
#         # I don't do feature selection, so l2 are more appropriate to predict
#         # keep all features but just constrain
#         penalty='l2',
#         solver='liblinear',
#         # handle class imbalance --> TARGET are very imbalance
#         class_weight='balanced',  
#         random_state=474
#     )
# )
# lrPipeline.fit(xTrain, yTrain)

0,1,2
,steps,"[('simpleimputer', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,474
,solver,'liblinear'
,max_iter,100


In [None]:
# # Predict & evaluate
# yPred = lrPipeline.predict_proba(xTest)[:, 1]
# print("Logistic AUC:", roc_auc_score(yTest, yPred))
# print(classification_report(yTest, lrPipeline.predict(xTest)))

Logistic AUC: 0.7404078968428651
              precision    recall  f1-score   support

           0       0.96      0.68      0.79     56538
           1       0.15      0.67      0.25      4965

    accuracy                           0.68     61503
   macro avg       0.56      0.67      0.52     61503
weighted avg       0.89      0.68      0.75     61503



<span style="font-size:30px; font-family:'Times new roman'; font-weight:bold">
Random Forest🪵
</span>

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import StratifiedKFold, cross_val_score

In [None]:
# randomForestPipe = make_pipeline(
#     SimpleImputer(strategy='constant', fill_value=0),
#     StandardScaler(),
#     RandomForestClassifier(
#         n_estimators=200,
#         max_depth=8,
#         class_weight='balanced',
#         random_state=475,
#         n_jobs=-1
#     )
# )
# randomForestPipe.fit(xTrain, yTrain)

0,1,2
,steps,"[('simpleimputer', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,8
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# yPred2 = randomForestPipe.predict_proba(xTest)[:, 1]
# print("Random Forest Test AUC: %.4f" % roc_auc_score(yTest, yPred2))
# print("\n▶ Classification report 0.5 threshold:\n", 
#       classification_report(yTest, (yPred2 >= 0.5).astype(int)))

Random Forest Test AUC: 0.7327

▶ Classification report 0.5 threshold:
               precision    recall  f1-score   support

           0       0.96      0.70      0.81     56538
           1       0.16      0.64      0.25      4965

    accuracy                           0.70     61503
   macro avg       0.56      0.67      0.53     61503
weighted avg       0.89      0.70      0.76     61503



In [None]:
# x = x.replace([np.inf, -np.inf], np.nan)

<span style="font-size:20px; font-family:'Times new roman'; font-weight:bold">
Try k-fold cross-validation --> 8 folds
</span>

In [None]:
# cv = StratifiedKFold(8, shuffle=True, random_state=42)
# cv_scores = cross_val_score(randomForestPipe, x, y, cv=cv, scoring='roc_auc')
# print("5-fold CV AUC: %.4f ± %.4f" 
#       % (cv_scores.mean(), cv_scores.std()))

5-fold CV AUC: 0.7388 ± 0.0034
