In [17]:
from sklearn.metrics import roc_auc_score
import numpy as np 
import pandas as pd 
import statsmodels.api as sm

import pickle

# Dataset to be scored; read from the working directory.
df = pd.read_csv('gen_data.csv')


def _load_pickle(path):
    """Unpickle a single artifact from ``path`` and return it.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails), instead of being left for the garbage
    collector.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file. Only load artifacts that come from a trusted source.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Model artifacts; each is indexed by segment label when scoring.
ALL_WOE_MODELS = _load_pickle("ALL_WOE_MODELS.pkl")
ALL_FINAL_VARS = _load_pickle("ALL_FINAL_VARS.pkl")
ALL_LR_MODELS = _load_pickle("ALL_LR_MODELS.pkl")


# Start with every row unscored; rows not covered by the loop stay NaN and are
# reported after the loop.
df['LR_PROB'] = np.nan

segments = ["1. Rich/Secured", "2. Prime Unsecured", "3. Mass Unsecured"]


for seg in segments:
    print(f"Scoring segment: {seg}")
    # One boolean mask is used both to select the rows and to write the
    # predictions back, so both sides refer to exactly the same positions.
    mask = (df['SEGMENT'] == seg)
    if not mask.any():
        # No rows for this segment: skip instead of pushing an empty frame
        # through the WOE transform and the model.
        continue
    df_seg = df[mask].copy()

    woe_dict = ALL_WOE_MODELS[seg]
    var_list = ALL_FINAL_VARS[seg]
    lr_model = ALL_LR_MODELS[seg]

    # Build the WOE design matrix on the segment's original index.
    X_woe = pd.DataFrame(index=df_seg.index)
    for feature in var_list:
        X_woe[feature] = woe_dict[feature].transform(df_seg[feature], metric="woe")

    # has_constant='add' forces the intercept column to be added. The default
    # ('skip') silently omits it when any WOE column happens to be constant
    # within the segment, which would misalign the columns with the fitted
    # coefficients.
    X_const = sm.add_constant(X_woe, has_constant='add')

    # Assign a plain array (not a Series) so the write is positional within
    # the mask and cannot be affected by index alignment. np.asarray accepts
    # both a Series and an ndarray returned by predict().
    df.loc[mask, 'LR_PROB'] = np.asarray(lr_model.predict(X_const))

# Fail early with a useful message if any row is still unscored (segment label
# missing from `segments`, or a NaN prediction), rather than letting a later
# metric call fail on NaN input.
unscored = df['LR_PROB'].isna()
if unscored.any():
    unscored_by_segment = df.loc[unscored, 'SEGMENT'].value_counts(dropna=False).to_dict()
    raise ValueError(
        f"{int(unscored.sum())} rows have no LR_PROB after scoring; "
        f"counts by SEGMENT: {unscored_by_segment}"
    )

# Pooled Gini over the whole frame: Gini = 2 * AUC - 1, computed from the
# observed BAD_NEXT_12M flag and the LR_PROB column.
y_true, y_prob = df['BAD_NEXT_12M'], df['LR_PROB']

auc = roc_auc_score(y_true=y_true, y_score=y_prob)
gini = auc * 2 - 1

print(f"\n>>> GINI TOÀN BỘ (3 segment LR gộp lại): {gini:.4f}")

Scoring segment: 1. Rich/Secured
Scoring segment: 2. Prime Unsecured
Scoring segment: 3. Mass Unsecured

>>> GINI TOÀN BỘ (3 segment LR gộp lại): 0.8735


In [18]:
# Distinct segment labels present in the data, in order of first appearance.
pd.unique(df['SEGMENT'])

array(['3. Mass Unsecured', '2. Prime Unsecured', '1. Rich/Secured'],
      dtype=object)

In [19]:
# Segment labels for which a WOE model was loaded.
print(f"WOE keys: {ALL_WOE_MODELS.keys()}")

WOE keys: dict_keys(['2. Prime Unsecured', '1. Rich/Secured', '3. Mass Unsecured'])


In [20]:
# Share of rows per segment whose LR_PROB is still missing (0.0 = fully scored).
df['LR_PROB'].isna().groupby(df['SEGMENT']).mean()


SEGMENT
1. Rich/Secured       0.0
2. Prime Unsecured    0.0
3. Mass Unsecured     0.0
Name: LR_PROB, dtype: float64

In [21]:
# Missing-value count per WOE column. X_woe is not rebuilt in this cell: it is
# whatever the most recent scoring-loop iteration left in the kernel.
X_woe.isnull().sum(axis=0)

MAX_DPD_12M_OBS      0
N_AVG_DEPOSIT_12M    0
CBAL_TO_INC_12MON    0
dtype: int64

In [22]:
# Scoring diagnostics: rows per segment, share of missing LR_PROB per segment,
# and a sample of rows that are still unscored.
segment_counts = df['SEGMENT'].value_counts()
nan_ratio_by_segment = df.groupby("SEGMENT")['LR_PROB'].apply(lambda col: col.isna().mean())
nan_examples = df.loc[df['LR_PROB'].isna()].head()

print("SEGMENT counts:\n", segment_counts)
print("\nNaN ratio by segment:\n", nan_ratio_by_segment)
print("\nExample NaN rows:\n", nan_examples)


SEGMENT counts:
 SEGMENT
3. Mass Unsecured     437372
1. Rich/Secured       421292
2. Prime Unsecured    279495
Name: count, dtype: int64



NaN ratio by segment:
 SEGMENT
1. Rich/Secured       0.0
2. Prime Unsecured    0.0
3. Mass Unsecured     0.0
Name: LR_PROB, dtype: float64

Example NaN rows:
 Empty DataFrame
Columns: [SOCIF, C_GIOITINH, TRINHDO, TTHONNHAN, SOHUUNHA, NHANVIENBIDV, INHERENT_RISK, REF_MONTH, REF_DAY, year, BASE_AUM, CURRENT_RISK, TUOI, SNAPSHOT_DATE, INCOME, CBAL, CBALORG, AFLIMT_MAX, AFLIMT_MIN, AFLIMT_AVG, CBAL_AVG, CBAL_MAX, CBAL_MIN, COLLATERAL_VALUE, LTV, N_AVG_DEPOSIT_12M, N_AVG_DEPOSIT_6M, N_AVG_DD_12M, N_AVG_CD_12M, FLAG_SALARY_ACC, FLAG_DEPOSIT, CBAL_SHORTTERM_LOAN, CBAL_LONGTERM_LOAN, HAS_SHORTTERM_LOAN, HAS_LONGTERM_LOAN, DURATION_MAX, REMAINING_DURATION_MAX, TIME_TO_OP_MAX, RATE_AVG, PURCOD_MAX, PURCOD_MIN, MAX_DPD_12M, MAX_DPD_12M_OBS, AVG_OD_DPD_12M, SUM_ALL_OD_12M, BAD_CURRENT, XULYNO, MAX_NHOMNOCIC, N_AVG_OVERDUE_CBAL_12M, CBAL_TO_INC_12MON, REAL_GDP_GROWTH_12M, BAD_NEXT_12M, SAMPLE_TYPE, SEGMENT, LR_PROB]
Index: []

[0 rows x 55 columns]
