In [685]:
import pandas as pd 
from optbinning import OptimalBinning
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

In [686]:
df = pd.read_csv('gen_data.csv')

### Tính đầy đủ 

In [687]:
pd.set_option('display.max_rows', None)

Tỷ lệ missing theo cột

In [688]:
missing_col = pd.DataFrame({
    'missing_count': df.isna().sum(),
    'missing_pct': df.isna().mean() * 100
}).sort_values('missing_pct', ascending=False)

missing_col

Unnamed: 0,missing_count,missing_pct
SOCIF,0,0.0
PURCOD_MIN,0,0.0
FLAG_DEPOSIT,0,0.0
CBAL_SHORTTERM_LOAN,0,0.0
CBAL_LONGTERM_LOAN,0,0.0
HAS_SHORTTERM_LOAN,0,0.0
HAS_LONGTERM_LOAN,0,0.0
DURATION_MAX,0,0.0
REMAINING_DURATION_MAX,0,0.0
TIME_TO_OP_MAX,0,0.0


In [689]:
pd.set_option('display.max_rows', 20)

Tỷ lệ missing theo row

In [690]:
df['ROW_MISSING_PCT'] = df.isna().mean(axis=1) * 100

df['ROW_MISSING_PCT'].describe()

count   1,138,327.00
mean            0.00
std             0.00
min             0.00
25%             0.00
50%             0.00
75%             0.00
max             0.00
Name: ROW_MISSING_PCT, dtype: float64

In [691]:
df['FLAG_HIGH_MISSING'] = (df['ROW_MISSING_PCT'] > 40).astype(int)

### Tính duy nhất 

Trùng khóa logic (SOCIF – year)

In [692]:
dup_key = df.duplicated(subset=['SOCIF', 'year'], keep=False)

df.loc[dup_key].shape

(0, 52)

Trùng toàn bộ bản ghi (record duplicate)

In [693]:
dup_full = df.duplicated(keep=False)

dup_full.sum()


np.int64(0)

In [694]:
dup_rate = dup_full.mean() * 100
print(f"Tỷ lệ bản ghi trùng lặp: {dup_rate:.2f}%")


Tỷ lệ bản ghi trùng lặp: 0.00%


### Tính kịp thời 

Check dùng future information (leakage)

In [695]:
# BAD_NEXT_12M chỉ được missing ở năm cuối
leak_check = df[
    (df['year'] < df['year'].max()) &
    (df['BAD_NEXT_12M'].isna())
]

leak_check.shape


(0, 52)

Check continuity theo năm

In [696]:
year_count = df.groupby('year')['SOCIF'].nunique()
year_count


year
2018    200349
2019    204636
2020    155962
2021    160198
2022    206838
2023    210344
Name: SOCIF, dtype: int64

### Tính phù hợp 

In [697]:
rule_valid_age = (
    df['TUOI'].between(0, 120)
)

df.loc[~rule_valid_age].shape


(0, 52)

In [698]:
rule_valid_ltv = df['LTV'].between(0, 400)

df.loc[~rule_valid_ltv].shape


(0, 52)

In [699]:
rule_valid_duration = (
    (df['DURATION_MAX'] >= 0) &
    (df['REMAINING_DURATION_MAX'] >= 0)
)

df.loc[~rule_valid_duration].shape


(0, 52)

In [700]:
valid_gender = df['C_GIOITINH'].isin(['M', 'F', 'O'])
df.loc[~valid_gender].shape


(0, 52)

In [701]:
df['FLAG_INVALID'] = ~(
    rule_valid_age &
    rule_valid_ltv &
    rule_valid_duration &
    valid_gender
)


In [702]:
pd.set_option('display.max_columns', None)

In [703]:
df.describe()

Unnamed: 0,SOCIF,TRINHDO,SOHUUNHA,NHANVIENBIDV,INHERENT_RISK,year,BASE_AUM,CURRENT_RISK,TUOI,INCOME,CBAL,CBALORG,AFLIMT_MAX,AFLIMT_MIN,AFLIMT_AVG,CBAL_AVG,CBAL_MAX,CBAL_MIN,COLLATERAL_VALUE,LTV,N_AVG_DEPOSIT_12M,N_AVG_DEPOSIT_6M,N_AVG_DD_12M,N_AVG_CD_12M,FLAG_SALARY_ACC,FLAG_DEPOSIT,CBAL_SHORTTERM_LOAN,CBAL_LONGTERM_LOAN,HAS_SHORTTERM_LOAN,HAS_LONGTERM_LOAN,DURATION_MAX,REMAINING_DURATION_MAX,TIME_TO_OP_MAX,RATE_AVG,PURCOD_MAX,PURCOD_MIN,MAX_DPD_12M,MAX_DPD_12M_OBS,AVG_OD_DPD_12M,SUM_ALL_OD_12M,BAD_CURRENT,XULYNO,MAX_NHOMNOCIC,N_AVG_OVERDUE_CBAL_12M,CBAL_TO_INC_12MON,REAL_GDP_GROWTH_12M,BAD_NEXT_12M,ROW_MISSING_PCT,FLAG_HIGH_MISSING
count,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0,1138327.0
mean,1199961.6,1.96,0.37,0.01,-0.45,2020.53,15577392.66,-0.42,38.73,24644597.89,356585070.48,392278108.46,771166906.92,694050215.77,732608561.09,356515468.9,428407616.71,285212374.72,752532209.87,132.67,5598486.3,5598486.3,1119696.86,4478789.44,0.91,0.83,178116568.63,178468501.85,1.0,1.0,91.39,47.52,43.87,8.21,4.6,4.6,24.39,24.39,6.86,48.34,0.0,0.0,1.62,166507333.69,14.42,5.66,0.12,0.0,0.0
std,115440.68,0.82,0.48,0.1,0.61,1.77,22781955.16,0.76,9.69,10337725.44,253432718.91,280044694.87,357276408.51,321548767.66,339412588.09,258218905.26,310051887.44,206575124.21,967671702.03,88.01,9899404.16,9899404.16,1979880.83,7919523.32,0.28,0.38,178836718.92,178708929.6,0.0,0.0,66.65,44.66,42.11,2.02,2.54,2.54,24.04,24.04,8.19,52.0,0.0,0.0,0.49,130535861.52,7.53,2.03,0.33,0.0,0.0
min,1000000.0,1.0,0.0,0.0,-1.41,2018.0,49727.0,-3.45,18.0,8434659.0,200882.0,237960.0,240489089.0,216440180.0,228464634.0,221060.0,265272.0,176848.0,39781.0,0.03,13278.0,13278.0,2655.0,10623.0,0.0,0.0,9.0,28.0,1.0,1.0,6.0,0.0,1.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.58,0.0,0.0,0.0
25%,1100216.0,1.0,0.0,0.0,-0.91,2019.0,4345251.0,-0.95,32.0,18148954.5,187436063.5,205614280.0,546773025.5,492095722.5,519434373.5,184988652.5,222429275.5,147990921.5,4771108.0,20.22,1350469.5,1350469.5,270093.5,1080376.0,1.0,1.0,58739301.0,58954187.5,1.0,1.0,36.0,17.0,15.0,6.7,2.0,2.0,5.0,5.0,1.0,8.0,0.0,0.0,1.0,83059301.5,8.75,2.91,0.0,0.0,0.0
50%,1199889.0,2.0,0.0,0.0,-0.57,2021.0,8787907.0,-0.47,39.0,22166739.0,293442580.0,322339312.0,682666499.0,614399849.0,648533174.0,291110424.0,349895246.0,232888339.0,14826793.0,200.0,2838993.0,2838993.0,567798.0,2271195.0,1.0,1.0,127646860.0,128123248.0,1.0,1.0,60.0,31.0,29.0,8.11,6.0,6.0,16.0,16.0,4.0,29.0,0.0,0.0,2.0,138681626.0,13.05,7.02,0.0,0.0,0.0
75%,1299960.0,2.0,1.0,0.0,-0.11,2022.0,18020020.5,0.05,45.0,27198264.0,450082570.0,495113255.0,873329773.5,785996796.0,829663284.5,449965528.0,540671852.0,359972422.0,2005339463.0,200.0,6083645.0,6083645.0,1216729.0,4866916.0,1.0,1.0,238411537.5,239047897.0,1.0,1.0,120.0,60.0,54.0,9.55,6.0,6.0,39.0,39.0,10.0,73.0,0.0,0.0,2.0,217742167.0,18.67,7.08,0.0,0.0,0.0
max,1399999.0,4.0,1.0,1.0,4.92,2023.0,1481280213.0,4.55,75.0,88854689.0,4514879992.0,5180860206.0,6812196139.0,6130976525.0,6471586332.0,4853874181.0,5824649017.0,3883099344.0,3185024170.0,200.0,788156009.0,788156009.0,157631201.0,630524808.0,1.0,1.0,3301651335.0,3471622505.0,1.0,1.0,240.0,227.0,216.0,20.63,9.0,9.0,89.0,89.0,44.0,266.0,0.0,0.0,2.0,2257439996.0,80.63,8.02,1.0,0.0,0.0


In [704]:
pd.set_option('display.max_columns', 20)

In [705]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138327 entries, 0 to 1138326
Data columns (total 53 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   SOCIF                   1138327 non-null  int64  
 1   C_GIOITINH              1138327 non-null  object 
 2   TRINHDO                 1138327 non-null  int64  
 3   TTHONNHAN               1138327 non-null  object 
 4   SOHUUNHA                1138327 non-null  int64  
 5   NHANVIENBIDV            1138327 non-null  int64  
 6   INHERENT_RISK           1138327 non-null  float64
 7   year                    1138327 non-null  int64  
 8   BASE_AUM                1138327 non-null  int64  
 9   CURRENT_RISK            1138327 non-null  float64
 10  TUOI                    1138327 non-null  int64  
 11  INCOME                  1138327 non-null  int64  
 12  CBAL                    1138327 non-null  int64  
 13  CBALORG                 1138327 non-null  int64  
 14  AF

In [706]:
cat_cols = df.select_dtypes(include=['object', 'category']).columns

for col in cat_cols:
    print(f"\n{'='*20} {col} {'='*20}")
    vc = df[col].value_counts(dropna=False)
    pct = df[col].value_counts(normalize=True, dropna=False) * 100
    print(pd.concat([vc, pct.rename('pct_%')], axis=1))


             count  pct_%
C_GIOITINH               
F           560397  49.23
M           555439  48.79
O            22491   1.98

            count  pct_%
TTHONNHAN               
Married    596357  52.39
Single     541970  47.61

              count  pct_%
SAMPLE_TYPE               
TRAIN        721145  63.35
OOT          210344  18.48
OOS          206838  18.17


### Tính chính xác

Tuổi khách hàng

In [707]:
rule_acc_age = df['TUOI'] >= 15

LTV thực tế

In [708]:
rule_acc_ltv = df['LTV'] < 400

Hạn mức – dư nợ

In [709]:
rule_acc_limit = df['AFLIMT_MAX'] >= df['CBAL']

Thu nhập – dư nợ

In [710]:
rule_acc_dti = df['CBAL'] <= df['INCOME'] * 20

CIC – DPD

In [711]:
rule_acc_cic = ~(
    (df['MAX_DPD_12M_OBS'] < 30) & (df['MAX_NHOMNOCIC'] > 1)
) & ~(
    (df['MAX_DPD_12M_OBS'] >= 90) & (df['MAX_NHOMNOCIC'] < 3)
)

Flag & tỷ lệ chính xác

In [712]:
df['FLAG_INACCURATE'] = ~(
    rule_acc_age &
    rule_acc_ltv &
    rule_acc_limit &
    rule_acc_dti &
    rule_acc_cic
)

accuracy_rate = 1 - df['FLAG_INACCURATE'].mean()
accuracy_rate

np.float64(0.565352486587773)

In [713]:
rule_acc_age.sum()

np.int64(1138327)

In [714]:
rule_acc_ltv.sum()

np.int64(1138327)

In [715]:
rule_acc_dti.sum()

np.int64(903090)

In [716]:
rule_acc_cic.sum()

np.int64(810097)

### Tính đồng nhất

CBAL vs cấu trúc kỳ hạn

In [717]:
# rule_cons_balance = (
#     df['CBAL'] ==
#     df['CBAL_SHORTTERM_LOAN']
#     + df['CBAL_MIDTERM_LOAN']
#     + df['CBAL_LONGTERM_LOAN']
# )

# df.loc[~rule_cons_balance].shape

Deposit breakdown

In [718]:
rule_cons_deposit = (
    abs(
        df['N_AVG_DEPOSIT_12M'] -
        (df['N_AVG_DD_12M'] + df['N_AVG_CD_12M'])
    ) <= 1
)
df.loc[~rule_cons_deposit].shape

(0, 54)

DPD logic

In [719]:
rule_cons_dpd = df['MAX_DPD_12M_OBS'] <= df['MAX_DPD_12M']

df.loc[~rule_cons_dpd].shape


(0, 54)

Flag tổng hợp consistency

In [720]:
df['FLAG_INCONSISTENT'] = ~(
    # rule_cons_balance &
    rule_cons_deposit &
    rule_cons_dpd
)

BẢNG TỔNG HỢP (RẤT NÊN CÓ)

In [721]:
quality_summary = pd.DataFrame({
    'Metric': [
        'Completeness',
        'Uniqueness',
        'Timeliness',
        'Validity',
        'Accuracy',
        'Consistency'
    ],
    'Issue_Rate_%': [
        df['FLAG_HIGH_MISSING'].mean() * 100,
        dup_full.mean() * 100,
        leak_check.shape[0] / len(df) * 100,
        df['FLAG_INVALID'].mean() * 100,
        df['FLAG_INACCURATE'].mean() * 100,
        df['FLAG_INCONSISTENT'].mean() * 100
    ]
})

quality_summary

Unnamed: 0,Metric,Issue_Rate_%
0,Completeness,0.0
1,Uniqueness,0.0
2,Timeliness,0.0
3,Validity,0.0
4,Accuracy,43.46
5,Consistency,0.0


In [722]:
df['FLAG_AGE_INVALID'] = (~rule_acc_age).astype(int)
df['FLAG_LTV_HIGH']    = (~rule_acc_ltv).astype(int)
df['FLAG_DTI_HIGH']    = (~rule_acc_dti).astype(int)
df['FLAG_CIC_LOW']     = (~rule_acc_cic).astype(int)
# df['FLAG_INCONSISTENT_DEPOSIT'] = (~rule_cons_deposit).astype(int)

In [723]:
import numpy as np

In [None]:
flag_cols = [c for c in df.columns if c.startswith('FLAG_')]

dq_flag_impact = []

for c in flag_cols:
    tmp = df.groupby(c)['BAD_NEXT_12M'].mean()
    dq_flag_impact.append({
        'FLAG': c,
        'BAD_RATE_0': tmp.get(0, np.nan),
        'BAD_RATE_1': tmp.get(1, np.nan),
        'LIFT': tmp.get(1, np.nan) / tmp.get(0, np.nan)
    })

dq_flag_impact = pd.DataFrame(dq_flag_impact).sort_values('LIFT', ascending=False)
dq_flag_impact

Unnamed: 0,FLAG,BAD_RATE_0,BAD_RATE_1,LIFT
8,FLAG_DTI_HIGH,0.11,0.15,1.29
4,FLAG_INACCURATE,0.14,0.1,0.69
0,FLAG_SALARY_ACC,0.17,0.12,0.66
9,FLAG_CIC_LOW,0.15,0.05,0.36
1,FLAG_DEPOSIT,0.28,0.09,0.31
2,FLAG_HIGH_MISSING,0.12,,
3,FLAG_INVALID,0.12,,
5,FLAG_INCONSISTENT,0.12,,
6,FLAG_AGE_INVALID,0.12,,
7,FLAG_LTV_HIGH,0.12,,


In [725]:
for f in ['FLAG_HIGH_MISSING', 'FLAG_INCONSISTENT']:
    print(f, df[f].value_counts(dropna=False))


FLAG_HIGH_MISSING FLAG_HIGH_MISSING
0    1138327
Name: count, dtype: int64
FLAG_INCONSISTENT FLAG_INCONSISTENT
False    1138327
Name: count, dtype: int64


In [726]:
df.shape

(1138327, 59)

In [727]:
df.drop(['FLAG_DTI_HIGH', 'FLAG_INACCURATE', 'FLAG_CIC_LOW', 'FLAG_HIGH_MISSING', 'FLAG_INCONSISTENT', 'ROW_MISSING_PCT'], axis=1, inplace=True)

In [728]:
df.columns

Index(['SOCIF', 'C_GIOITINH', 'TRINHDO', 'TTHONNHAN', 'SOHUUNHA',
       'NHANVIENBIDV', 'INHERENT_RISK', 'year', 'BASE_AUM', 'CURRENT_RISK',
       'TUOI', 'INCOME', 'CBAL', 'CBALORG', 'AFLIMT_MAX', 'AFLIMT_MIN',
       'AFLIMT_AVG', 'CBAL_AVG', 'CBAL_MAX', 'CBAL_MIN', 'COLLATERAL_VALUE',
       'LTV', 'N_AVG_DEPOSIT_12M', 'N_AVG_DEPOSIT_6M', 'N_AVG_DD_12M',
       'N_AVG_CD_12M', 'FLAG_SALARY_ACC', 'FLAG_DEPOSIT',
       'CBAL_SHORTTERM_LOAN', 'CBAL_LONGTERM_LOAN', 'HAS_SHORTTERM_LOAN',
       'HAS_LONGTERM_LOAN', 'DURATION_MAX', 'REMAINING_DURATION_MAX',
       'TIME_TO_OP_MAX', 'RATE_AVG', 'PURCOD_MAX', 'PURCOD_MIN', 'MAX_DPD_12M',
       'MAX_DPD_12M_OBS', 'AVG_OD_DPD_12M', 'SUM_ALL_OD_12M', 'BAD_CURRENT',
       'XULYNO', 'MAX_NHOMNOCIC', 'N_AVG_OVERDUE_CBAL_12M',
       'CBAL_TO_INC_12MON', 'REAL_GDP_GROWTH_12M', 'BAD_NEXT_12M',
       'SAMPLE_TYPE', 'FLAG_INVALID', 'FLAG_AGE_INVALID', 'FLAG_LTV_HIGH'],
      dtype='object')

In [729]:
df_train = df[df['SAMPLE_TYPE'] == 'TRAIN']
df_train_y = df_train['BAD_NEXT_12M']
df_OOS = df[df['SAMPLE_TYPE'] == 'OOS']
df_OOS_y = df_OOS['BAD_NEXT_12M']
df_OOT = df[df['SAMPLE_TYPE'] == 'OOT']
df_OOT_y = df_OOT['BAD_NEXT_12M']

In [730]:
df_train.drop(['SAMPLE_TYPE','BAD_NEXT_12M'], axis=1, inplace=True)
df_OOS.drop(['SAMPLE_TYPE', 'BAD_NEXT_12M'], axis=1, inplace=True)
df_OOT.drop(['SAMPLE_TYPE', 'BAD_NEXT_12M'], axis=1, inplace=True)

In [731]:
df_train.columns

Index(['SOCIF', 'C_GIOITINH', 'TRINHDO', 'TTHONNHAN', 'SOHUUNHA',
       'NHANVIENBIDV', 'INHERENT_RISK', 'year', 'BASE_AUM', 'CURRENT_RISK',
       'TUOI', 'INCOME', 'CBAL', 'CBALORG', 'AFLIMT_MAX', 'AFLIMT_MIN',
       'AFLIMT_AVG', 'CBAL_AVG', 'CBAL_MAX', 'CBAL_MIN', 'COLLATERAL_VALUE',
       'LTV', 'N_AVG_DEPOSIT_12M', 'N_AVG_DEPOSIT_6M', 'N_AVG_DD_12M',
       'N_AVG_CD_12M', 'FLAG_SALARY_ACC', 'FLAG_DEPOSIT',
       'CBAL_SHORTTERM_LOAN', 'CBAL_LONGTERM_LOAN', 'HAS_SHORTTERM_LOAN',
       'HAS_LONGTERM_LOAN', 'DURATION_MAX', 'REMAINING_DURATION_MAX',
       'TIME_TO_OP_MAX', 'RATE_AVG', 'PURCOD_MAX', 'PURCOD_MIN', 'MAX_DPD_12M',
       'MAX_DPD_12M_OBS', 'AVG_OD_DPD_12M', 'SUM_ALL_OD_12M', 'BAD_CURRENT',
       'XULYNO', 'MAX_NHOMNOCIC', 'N_AVG_OVERDUE_CBAL_12M',
       'CBAL_TO_INC_12MON', 'REAL_GDP_GROWTH_12M', 'FLAG_INVALID',
       'FLAG_AGE_INVALID', 'FLAG_LTV_HIGH'],
      dtype='object')