In [1]:
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the modelling dataset into `df`; the later cells in this notebook read from it.
# NOTE(review): the last visible cell writes `df` back to this same path after a
# SEGMENT column has been added, so a re-run starts from the modified file - confirm
# that this is intended.
df = pd.read_csv('gen_data.csv')

In [3]:
# Candidate segmentation drivers, grouped by theme. The per-column notes are
# translated from the original author's comments. The order of this list matters:
# it is later paired positionally with the tree's feature_importances_.
segmentation_candidates = [
    # --- GROUP 1: WEALTH & SCALE - "Rich or poor?" ---
    'BASE_AUM',             # Total assets (noted by the author as the most important)
    'INCOME',               # Income
    'COLLATERAL_VALUE',     # Collateral value (separates secured from unsecured lending)
    'SOHUUNHA',             # Home ownership (yes / no)
    
    # --- GROUP 2: CREDIT EXPOSURE - "Borrows a lot or a little?" ---
    'AFLIMT_MAX',           # Credit limit (capacity)
    'CBAL',                 # Current outstanding balance
    'CBAL_MAX',             # Peak outstanding balance
    'LTV',
    
    # --- GROUP 3: PRODUCT & PURPOSE - "What is the loan for?" ---
    # (Newly added - noted by the author as very important for splitting the portfolio)
    'PURCOD_MAX',           # Loan purpose code (real estate, consumer, business, ...)
    'HAS_SHORTTERM_LOAN',   # Short-term borrowing only?
    'HAS_LONGTERM_LOAN',    # Has long-term borrowing?
    'DURATION_MAX',         # Loan tenor (short / medium / long)
    
    # --- GROUP 4: LIQUIDITY - "Is there cash?" ---
    'N_AVG_DEPOSIT_12M',    # Average deposit balance
    'FLAG_SALARY_ACC',      # Salary paid through this bank?
    'FLAG_DEPOSIT',         # Holds a deposit account?
    
    # --- GROUP 5: DEMOGRAPHICS - "Who?" ---
    'TUOI',                 # Age (younger and older borrowers carry different risk)
    'TRINHDO',              # Education level
    'TTHONNHAN'             # Marital status
]

# Target vector and feature matrix, both taken as copies so that the encoding
# below does not write into `df`.
y = df['BAD_NEXT_12M'].copy()
X = df[segmentation_candidates].copy()

# Columns that are not already numeric (object / category / bool) get
# integer-encoded before they reach the tree.
categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns

for col in categorical_cols:
    # Missing entries first become the literal label "Unknown"; pd.factorize
    # then replaces every distinct label with an integer code (0, 1, 2, ...).
    X[col] = pd.factorize(X[col].fillna("Unknown"))[0]
    print(f" -> ƒê√£ m√£ h√≥a bi·∫øn: {col}")

# Any gap still left in the matrix is filled with the constant 0. The original
# author notes that the data has no nulls (and that a mean fill could be used
# instead, depending on the business rule).
imputer = SimpleImputer(fill_value=0, strategy='constant')
X_filled = imputer.fit_transform(X)

# 4. Decision-tree configuration (the original note says it follows the BIDV
# report): Gini criterion with balanced class weights. The depth is capped at 5
# to limit overfitting while feature importances are computed.
dt_model = DecisionTreeClassifier(
    max_depth=5,
    random_state=42,
    class_weight='balanced',
    criterion='gini',
    splitter='best',
).fit(X_filled, y)  # fit() returns the estimator itself, so construction and training are chained

# 5. Feature-importance table: one row per candidate, ranked from most to
# least important.
importance_df = pd.DataFrame({
    'T√™n Bi·∫øn': segmentation_candidates,
    'ƒê·ªô Quan Tr·ªçng (Feature Importance)': dt_model.feature_importances_,
}).sort_values(by='ƒê·ªô Quan Tr·ªçng (Feature Importance)', ascending=False)

print("--- K·∫æT QU·∫¢ X·∫æP H·∫†NG ƒê·ªò QUAN TR·ªåNG C·ª¶A BI·∫æN ---")
print(importance_df)

 -> ƒê√£ m√£ h√≥a bi·∫øn: TTHONNHAN
--- K·∫æT QU·∫¢ X·∫æP H·∫†NG ƒê·ªò QUAN TR·ªåNG C·ª¶A BI·∫æN ---
              T√™n Bi·∫øn  ƒê·ªô Quan Tr·ªçng (Feature Importance)
2     COLLATERAL_VALUE                            0.651133
12   N_AVG_DEPOSIT_12M                            0.312930
7                  LTV                            0.023635
5                 CBAL                            0.008419
0             BASE_AUM                            0.002894
6             CBAL_MAX                            0.000989
11        DURATION_MAX                            0.000000
16             TRINHDO                            0.000000
15                TUOI                            0.000000
14        FLAG_DEPOSIT                            0.000000
13     FLAG_SALARY_ACC                            0.000000
9   HAS_SHORTTERM_LOAN                            0.000000
10   HAS_LONGTERM_LOAN                            0.000000
1               INCOME                            0.000000
8     

In [4]:
# from sklearn.tree import plot_tree
# import matplotlib.pyplot as plt

# # Reconfigure a shallower tree (max_depth=3) so the diagram is easier to read
# dt_viz = DecisionTreeClassifier(
#     criterion='gini', 
#     max_depth=3,            # Draw only 3 levels
#     min_samples_leaf=0.05, 
#     class_weight='balanced',
#     random_state=42
# )
# dt_viz.fit(X_filled, y)

# # Draw the figure
# plt.figure(figsize=(25, 12))
# plot_tree(
#     dt_viz, 
#     feature_names=segmentation_candidates, 
#     class_names=['Good', 'Bad'],
#     filled=True, 
#     rounded=True, 
#     fontsize=12,
#     impurity=False, # Hide the Gini index to reduce clutter
#     proportion=True # Show proportions (%)
# )
# plt.title("S∆† ƒê·ªí PH√ÇN KH√öC KH√ÅCH H√ÄNG (D·ª±a tr√™n Collateral & Deposit)")
# plt.show()

In [5]:
# ==========================================
# 11. FIND THE OPTIMAL CUT-OFF (OPTIMAL BINNING)
# ==========================================
from optbinning import OptimalBinning
import pandas as pd

# Section banner printed ahead of the per-feature cut-off reports.
print("\n=== T√åM NG∆Ø·ª†NG C·∫ÆT T·ªêI ∆ØU (DATA-DRIVEN THRESHOLDS) ===")

# Helper that looks for a single data-driven cut-off for one feature.
def find_optimal_cutoff(df, feature, target):
    """Fit a two-bin OptimalBinning on ``feature`` and report its split point.

    Parameters
    ----------
    df : DataFrame holding both the ``feature`` and ``target`` columns.
    feature : name of the column to bin (binned with dtype="numerical").
    target : name of the column passed to the binner as the target.

    Returns
    -------
    The first entry of the fitted binner's ``splits``, or ``None`` when the
    binner produced no split at all.

    Side effects
    ------------
    Prints a short report and, when a split exists, displays the binning table.
    """
    # max_n_bins=2 caps the result at two bins, i.e. at most one threshold.
    binner = OptimalBinning(name=feature, dtype="numerical", solver="cp", max_n_bins=2)
    binner.fit(df[feature], df[target])

    # The table is built up front (as in the original flow), even though it is
    # only shown when a split was found.
    table = binner.binning_table.build()
    split_points = binner.splits

    print(f"\n--- Ph√¢n t√≠ch bi·∫øn: {feature} ---")

    # Guard clause: nothing to report when no split point exists.
    if len(split_points) == 0:
        print("-> Kh√¥ng t√¨m th·∫•y ƒëi·ªÉm c·∫Øt t·ªëi ∆∞u (Bi·∫øn n√†y kh√¥ng ph√¢n t√°ch ƒë∆∞·ª£c r·ªßi ro).")
        return None

    cutoff = split_points[0]
    print(f"-> ƒêi·ªÉm c·∫Øt t·ªëi ∆∞u (Math): {cutoff:,.0f}")
    # round(..., -6) rounds to the nearest million as a business-friendly hint.
    print(f"-> G·ª£i √Ω l√†m tr√≤n (Business): {round(cutoff, -6):,.0f}")
    print("Chi ti·∫øt chia nh√≥m:")

    report_columns = ['Bin', 'Count', 'Event', 'Non-event', 'Event rate', 'IV']
    display(table[report_columns])
    return cutoff

# 1. Cut-off for COLLATERAL value, used to separate secured from unsecured
#    customers.
cutoff_collateral = find_optimal_cutoff(df, 'COLLATERAL_VALUE', 'BAD_NEXT_12M')

# 2. Cut-off for DEPOSITS, used to separate "VIP" from "mass" customers.
#    It is searched only on the unsecured subset: the secured group has already
#    been split off, so only the unsecured group is segmented further.
#
#    The binning table printed by find_optimal_cutoff shows the two bins as
#    (-inf, cutoff) and [cutoff, inf). A row exactly at the cut-off therefore
#    belongs to the upper (secured) bin, so the unsecured subset must use a
#    strict `<`. When no cut-off was found, fall back to "no collateral at all"
#    (value <= 0), as before.
if cutoff_collateral:
    df_unsecured = df[df['COLLATERAL_VALUE'] < cutoff_collateral]
else:
    df_unsecured = df[df['COLLATERAL_VALUE'] <= 0]
cutoff_deposit = find_optimal_cutoff(df_unsecured, 'N_AVG_DEPOSIT_12M', 'BAD_NEXT_12M')

# ==========================================
# UPDATE THE THRESHOLD VARIABLES (only if the computed numbers should be used)
# ==========================================
# THRESH_COLLATERAL = cutoff_collateral
# THRESH_DEPOSIT = cutoff_deposit


=== T√åM NG∆Ø·ª†NG C·∫ÆT T·ªêI ∆ØU (DATA-DRIVEN THRESHOLDS) ===

--- Ph√¢n t√≠ch bi·∫øn: COLLATERAL_VALUE ---
-> ƒêi·ªÉm c·∫Øt t·ªëi ∆∞u (Math): 36,214,774
-> G·ª£i √Ω l√†m tr√≤n (Business): 36,000,000
Chi ti·∫øt chia nh√≥m:


Unnamed: 0,Bin,Count,Event,Non-event,Event rate,IV
0,"(-inf, 36214774.00)",681516,133157,548359,0.195384,0.23767
1,"[36214774.00, inf)",456643,4563,452080,0.009992,1.094153
2,Special,0,0,0,0.0,0.0
3,Missing,0,0,0,0.0,0.0
Totals,,1138159,137720,1000439,0.121002,1.331822



--- Ph√¢n t√≠ch bi·∫øn: N_AVG_DEPOSIT_12M ---
-> ƒêi·ªÉm c·∫Øt t·ªëi ∆∞u (Math): 3,273,152
-> G·ª£i √Ω l√†m tr√≤n (Business): 3,000,000
Chi ti·∫øt chia nh√≥m:


Unnamed: 0,Bin,Count,Event,Non-event,Event rate,IV
0,"(-inf, 3273152.50)",420200,115541,304659,0.274967,0.139154
1,"[3273152.50, inf)",261316,17616,243700,0.067413,0.378207
2,Special,0,0,0,0.0,0.0
3,Missing,0,0,0,0.0,0.0
Totals,,681516,133157,548359,0.195384,0.517362


In [6]:
# ==========================================
# 12. FINAL SEGMENTATION
# ==========================================
print("\n=== √ÅP D·ª§NG NG∆Ø·ª†NG C·∫ÆT ƒê·ªÇ CHIA 3 PH√ÇN KH√öC ===")

# 1. Thresholds. The original note says these are rounded from the
#    optimal-binning results.
#    NOTE(review): the collateral threshold (2 billion) is far from the ~36
#    million cut-off printed by the optimal-binning cell - confirm which value
#    is intended.
THRESH_COLLATERAL = 2_000_000_000  # 2 billion VND
THRESH_DEPOSIT    = 3_500_000      # 3.5 million VND

print(f"-> Ng∆∞·ª°ng TSBƒê: {THRESH_COLLATERAL:,.0f} VND")
print(f"-> Ng∆∞·ª°ng Ti·ªÅn g·ª≠i: {THRESH_DEPOSIT:,.0f} VND")

# 2. Segment labels and the matching masks, in the same order. np.select takes
#    the first mask that is true for a row; rows matching none (for example a
#    missing value in either column) are labelled 'Unknown'.
choices = ['1. Rich/Secured', '2. Prime Unsecured', '3. Mass Unsecured']

conditions = [
    # Segment 1: collateral at or above the threshold (author's note: very low risk, 0.6%).
    df['COLLATERAL_VALUE'].ge(THRESH_COLLATERAL),

    # Segment 2: below the collateral threshold, but deposits at or above the
    # deposit threshold (author's note: medium risk, 6%).
    df['COLLATERAL_VALUE'].lt(THRESH_COLLATERAL) & df['N_AVG_DEPOSIT_12M'].ge(THRESH_DEPOSIT),

    # Segment 3: below both thresholds (author's note: high risk, 26%).
    df['COLLATERAL_VALUE'].lt(THRESH_COLLATERAL) & df['N_AVG_DEPOSIT_12M'].lt(THRESH_DEPOSIT),
]

df['SEGMENT'] = np.select(conditions, choices, default='Unknown')

# 3. Final check: size, bad rate and average balances per segment.
pd.set_option('display.float_format', '{:,.2f}'.format)

print("\n--- HI·ªÜU QU·∫¢ PH√ÇN KH√öC (SEGMENT PERFORMANCE) ---")
summary = df.groupby('SEGMENT').agg({
    'SOCIF': 'count',
    'BAD_NEXT_12M': 'mean',
    'CBAL': 'mean',
    'INCOME': 'mean',
    'COLLATERAL_VALUE': 'mean',
})
summary = summary.rename(columns={
    'SOCIF': 'S·ªë l∆∞·ª£ng KH',
    'BAD_NEXT_12M': 'Bad Rate',
    'CBAL': 'D∆∞ n·ª£ TB',
    'INCOME': 'Thu nh·∫≠p TB',
    'COLLATERAL_VALUE': 'TSBƒê TB',
})
print(summary)

# 4. One independent frame per segment, ready for the modelling step. The
#    labels come from `choices`, so the three frames follow its order.
df_seg1, df_seg2, df_seg3 = (
    df[df['SEGMENT'] == label].copy() for label in choices
)


=== √ÅP D·ª§NG NG∆Ø·ª†NG C·∫ÆT ƒê·ªÇ CHIA 3 PH√ÇN KH√öC ===
-> Ng∆∞·ª°ng TSBƒê: 2,000,000,000 VND
-> Ng∆∞·ª°ng Ti·ªÅn g·ª≠i: 3,500,000 VND

--- HI·ªÜU QU·∫¢ PH√ÇN KH√öC (SEGMENT PERFORMANCE) ---
                    S·ªë l∆∞·ª£ng KH  Bad Rate       D∆∞ n·ª£ TB   Thu nh·∫≠p TB  \
SEGMENT                                                                  
1. Rich/Secured          421292      0.01 370,931,597.14 25,443,505.53   
2. Prime Unsecured       279495      0.06 374,181,368.54 26,571,712.17   
3. Mass Unsecured        437372      0.27 331,460,037.04 22,643,870.90   

                            TSBƒê TB  
SEGMENT                              
1. Rich/Secured    2,014,876,589.99  
2. Prime Unsecured    20,533,948.95  
3. Mass Unsecured      4,948,737.72  


In [7]:
# Write each segment frame to its own CSV file, without the row index.
for _seg_frame, _seg_path in (
    (df_seg1, 'seg1.csv'),
    (df_seg2, 'seg2.csv'),
    (df_seg3, 'seg3.csv'),
):
    _seg_frame.to_csv(_seg_path, index=False)

In [8]:
# ==========================================
# 13. COMPUTE IV (INFORMATION VALUE) FOR EACH SEGMENT
# ==========================================
import pandas as pd
import numpy as np

def calculate_iv(df, feature, target):
    """Quick Information Value (IV) of ``feature`` against a 0/1 ``target``.

    The feature is cut into at most 10 quantile bins (``pd.qcut`` with
    ``duplicates='drop'``). For every bin the smoothed event and non-event
    shares are turned into ``WoE = ln(%non_event / %event)``, and the result is
    ``IV = sum((%non_event - %event) * WoE)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    feature : str
        Column to bin.
    target : str
        Column with the event flag; it is summed to count events.

    Returns
    -------
    float
        The IV, or ``0.0`` when the feature cannot be scored: it has no
        non-missing values, quantile binning fails with ``ValueError`` or
        ``TypeError``, or binning leaves every row without a bin.

    Raises
    ------
    KeyError
        If ``feature`` or ``target`` is not a column of ``df``. Only binning
        failures are mapped to 0; programming errors and ``KeyboardInterrupt``
        are no longer swallowed.

    Notes
    -----
    Rows whose feature value is missing receive no bin and are left out of
    both the per-bin counts and the totals.
    """
    values = df[feature]

    # Nothing to bin: empty frame or an all-missing feature.
    if not values.notna().any():
        return 0.0

    try:
        bins = pd.qcut(values, q=10, duplicates='drop')
    except (ValueError, TypeError):
        # Best effort: a feature that cannot be quantile-binned scores 0.
        return 0.0

    # A feature whose quantile edges all collapse can leave every row unbinned.
    if not bins.notna().any():
        return 0.0

    df_bin = pd.DataFrame({'bin': bins, 'y': df[target]})
    # observed=False keeps every quantile bin in the table, including empty
    # ones. This was the implicit default when the function was written.
    grp = df_bin.groupby('bin', observed=False)['y'].agg(['count', 'sum'])
    non_event = grp['count'] - grp['sum']

    # The +0.5 smoothing avoids division by zero and log(0) for pure bins.
    pct_event = (grp['sum'] + 0.5) / (grp['sum'].sum() + 0.5)
    pct_non_event = (non_event + 0.5) / (non_event.sum() + 0.5)

    woe = np.log(pct_non_event / pct_event)
    return float(((pct_non_event - pct_event) * woe).sum())

# Columns that must not be scored: identifiers, the segment label, sample and
# date fields, the target itself, and variables the original author flags as
# equivalent to the target.
exclude_cols = [
    'SOCIF', 'SEGMENT', 'SAMPLE_TYPE', 'year', 'BAD_NEXT_12M',
    'BAD_CURRENT', 'SNAPSHOT_DATE', 'BAD_EVENT_DATE', 'CURE_DATE',
    'XULYNO', 'MAX_NHOMNOCIC',
]

# Every numeric column that is not excluded gets scanned.
all_numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
features_to_check = list(filter(lambda name: name not in exclude_cols, all_numeric_cols))

print(f"T·ªïng s·ªë bi·∫øn s·∫Ω qu√©t IV: {len(features_to_check)} bi·∫øn")

# The scan is repeated for each of the three segments.
segments = ['1. Rich/Secured', '2. Prime Unsecured', '3. Mass Unsecured']
results = {}  # defined here but not filled anywhere in the visible code

print("\n=== K·∫æT QU·∫¢ IV (S·∫ÆP X·∫æP T·ª™ CAO XU·ªêNG TH·∫§P) ===")

for seg in segments:
    df_sub = df[df['SEGMENT'] == seg]
    iv_scores = {}

    for feat in features_to_check:
        iv = calculate_iv(df_sub, feat, 'BAD_NEXT_12M')
        # Keep only features with IV above 0.02 (at least slightly informative).
        if not iv > 0.02:
            continue
        iv_scores[feat] = iv

    # Rank the surviving features from highest to lowest IV.
    top_features = sorted(iv_scores.items(), key=lambda pair: pair[1], reverse=True)

    print(f"\nüîπ PH√ÇN KH√öC: {seg} (Top 10 bi·∫øn m·∫°nh nh·∫•t)")
    for feat, iv in top_features[:10]:  # show the ten strongest
        # Strength label: above 0.3, above 0.1, otherwise the weakest label.
        if iv > 0.3:
            strength = "M·∫°nh"
        elif iv > 0.1:
            strength = "Trung b√¨nh"
        else:
            strength = "Y·∫øu"
        print(f"   - {feat:25} : IV = {iv:.4f} ({strength})")

T·ªïng s·ªë bi·∫øn s·∫Ω qu√©t IV: 43 bi·∫øn

=== K·∫æT QU·∫¢ IV (S·∫ÆP X·∫æP T·ª™ CAO XU·ªêNG TH·∫§P) ===

üîπ PH√ÇN KH√öC: 1. Rich/Secured (Top 10 bi·∫øn m·∫°nh nh·∫•t)
   - MAX_DPD_12M               : IV = 4.6510 (M·∫°nh)
   - MAX_DPD_12M_OBS           : IV = 4.6510 (M·∫°nh)
   - SUM_ALL_OD_12M            : IV = 4.0638 (M·∫°nh)
   - AVG_OD_DPD_12M            : IV = 3.5730 (M·∫°nh)
   - N_AVG_DEPOSIT_12M         : IV = 1.5116 (M·∫°nh)
   - N_AVG_DEPOSIT_6M          : IV = 1.5116 (M·∫°nh)
   - N_AVG_DD_12M              : IV = 1.5116 (M·∫°nh)
   - N_AVG_CD_12M              : IV = 1.5116 (M·∫°nh)
   - N_AVG_OVERDUE_CBAL_12M    : IV = 0.5111 (M·∫°nh)
   - BASE_AUM                  : IV = 0.4895 (M·∫°nh)

üîπ PH√ÇN KH√öC: 2. Prime Unsecured (Top 10 bi·∫øn m·∫°nh nh·∫•t)
   - MAX_DPD_12M               : IV = 3.0615 (M·∫°nh)
   - MAX_DPD_12M_OBS           : IV = 3.0615 (M·∫°nh)
   - SUM_ALL_OD_12M            : IV = 2.8162 (M·∫°nh)
   - AVG_OD_DPD_12M            : IV = 2.5652 (M·∫°nh)
   - 

In [9]:
# Persist `df` to gen_data.csv without the row index.
# NOTE(review): this is the same path the notebook loads at the top, so the input
# file is overwritten with the frame that now carries the added SEGMENT column.
# A later Restart & Run All will then start from different data - confirm this is
# intended, or write to a separate output file.
df.to_csv("gen_data.csv", index=False)