In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np 
import pandas as pd 
import statsmodels.api as sm

df = pd.read_csv('gen_data.csv')
import pickle

ALL_WOE_MODELS = pickle.load(open("ALL_WOE_MODELS.pkl", "rb"))
ALL_FINAL_VARS = pickle.load(open("ALL_FINAL_VARS.pkl", "rb"))
ALL_LR_MODELS   = pickle.load(open("ALL_LR_MODELS.pkl", "rb"))


df['LR_PROB'] = np.nan

segments = ["1. Rich/Secured", "2. Prime Unsecured", "3. Mass Unsecured"]


for seg in segments:
    print(f"Scoring segment: {seg}")
    df_seg = df[df['SEGMENT'] == seg].copy()

    woe_dict = ALL_WOE_MODELS[seg]
    var_list = ALL_FINAL_VARS[seg]
    lr_model = ALL_LR_MODELS[seg]

    # Transform WOE
    X_woe = pd.DataFrame()
    for feature in var_list:
        X_woe[feature] = woe_dict[feature].transform(df_seg[feature], metric="woe")

    # Add constant
    X_const = sm.add_constant(X_woe)

    # Predict
    df.loc[df['SEGMENT']==seg, 'LR_PROB'] = lr_model.predict(X_const)

# Calculate overall Gini
y_true = df['BAD_NEXT_12M']
y_prob = df['LR_PROB']

auc = roc_auc_score(y_true, y_prob)
gini = 2*auc - 1

print(f"\n>>> GINI TOÀN BỘ (3 segment LR gộp lại): {gini:.4f}")

Scoring segment: 1. Rich/Secured
Scoring segment: 2. Prime Unsecured
Scoring segment: 3. Mass Unsecured


ValueError: Input contains NaN.

In [None]:
df['SEGMENT'].unique()

array(['3. Mass Unsecured', '2. Prime Unsecured', '1. Rich/Secured'],
      dtype=object)

In [None]:
print("WOE keys:", ALL_WOE_MODELS.keys())

WOE keys: dict_keys(['2. Prime Unsecured', '1. Rich/Secured'])


In [14]:
df.groupby("SEGMENT")['LR_PROB'].apply(lambda x: x.isna().mean())


SEGMENT
1. Rich/Secured       0.630582
2. Prime Unsecured    0.755556
3. Mass Unsecured     0.615332
Name: LR_PROB, dtype: float64

In [15]:
X_woe.isna().sum()

MAX_DPD_12M_OBS      0
N_AVG_DEPOSIT_12M    0
CBAL_TO_INC_12MON    0
dtype: int64

In [16]:
print("SEGMENT counts:\n", df['SEGMENT'].value_counts())
print("\nNaN ratio by segment:\n", df.groupby("SEGMENT")['LR_PROB'].apply(lambda x: x.isna().mean()))
print("\nExample NaN rows:\n", df[df['LR_PROB'].isna()].head())


SEGMENT counts:
 SEGMENT
3. Mass Unsecured     437372
1. Rich/Secured       421292
2. Prime Unsecured    279495
Name: count, dtype: int64

NaN ratio by segment:
 SEGMENT
1. Rich/Secured       0.630582
2. Prime Unsecured    0.755556
3. Mass Unsecured     0.615332
Name: LR_PROB, dtype: float64

Example NaN rows:
           SOCIF C_GIOITINH  TRINHDO TTHONNHAN  SOHUUNHA  NHANVIENBIDV  \
279497  1098532          O        3    Single         0             0   
279498  1098532          O        3    Single         0             0   
279499  1098532          O        3    Single         0             0   
279500  1098532          O        3    Single         0             0   
279502  1098532          O        3    Single         0             0   

        INHERENT_RISK  REF_MONTH  REF_DAY  year  ...  BAD_CURRENT  XULYNO  \
279497      -0.438063          7       28  2018  ...            0       0   
279498      -0.438063          7       28  2019  ...            0       0   
279499      -0.43