In [1]:
import pandas as pd 
from optbinning import OptimalBinning
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('gen_data.csv')

### Tính đầy đủ 

In [3]:
pd.set_option('display.max_rows', None)

Tỷ lệ missing theo cột

In [4]:
missing_col = pd.DataFrame({
    'missing_count': df.isna().sum(),
    'missing_pct': df.isna().mean() * 100
}).sort_values('missing_pct', ascending=False)

missing_col

Unnamed: 0,missing_count,missing_pct
SOCIF,0,0.0
N_AVG_DD_12M,0,0.0
FLAG_SALARY_ACC,0,0.0
FLAG_DEPOSIT,0,0.0
CBAL_SHORTTERM_LOAN,0,0.0
CBAL_LONGTERM_LOAN,0,0.0
HAS_SHORTTERM_LOAN,0,0.0
HAS_LONGTERM_LOAN,0,0.0
DURATION_MAX,0,0.0
REMAINING_DURATION_MAX,0,0.0


In [5]:
pd.set_option('display.max_rows', 20)

Tỷ lệ missing theo row

In [6]:
df['ROW_MISSING_PCT'] = df.isna().mean(axis=1) * 100

df['ROW_MISSING_PCT'].describe()

count    1138159.0
mean           0.0
std            0.0
min            0.0
25%            0.0
50%            0.0
75%            0.0
max            0.0
Name: ROW_MISSING_PCT, dtype: float64

In [7]:
df['FLAG_HIGH_MISSING'] = (df['ROW_MISSING_PCT'] > 40).astype(int)

### Tính duy nhất 

Trùng khóa logic (SOCIF – year)

In [8]:
dup_key = df.duplicated(subset=['SOCIF', 'year'], keep=False)

df.loc[dup_key].shape

(0, 55)

Trùng toàn bộ bản ghi (record duplicate)

In [9]:
dup_full = df.duplicated(keep=False)

dup_full.sum()


np.int64(0)

In [10]:
dup_rate = dup_full.mean() * 100
print(f"Tỷ lệ bản ghi trùng lặp: {dup_rate:.2f}%")


Tỷ lệ bản ghi trùng lặp: 0.00%


### Tính kịp thời 

Check dùng future information (leakage)

In [11]:
# BAD_NEXT_12M chỉ được missing ở năm cuối
leak_check = df[
    (df['year'] < df['year'].max()) &
    (df['BAD_NEXT_12M'].isna())
]

leak_check.shape


(0, 55)

Check continuity theo năm

In [12]:
year_count = df.groupby('year')['SOCIF'].nunique()
year_count


year
2018    199938
2019    204410
2020    156132
2021    160135
2022    206641
2023    210903
Name: SOCIF, dtype: int64

### Tính phù hợp 

In [13]:
rule_valid_age = (
    df['TUOI'].between(0, 120)
)

df.loc[~rule_valid_age].shape


(0, 55)

In [14]:
rule_valid_ltv = df['LTV'].between(0, 400)

df.loc[~rule_valid_ltv].shape


(0, 55)

In [15]:
rule_valid_duration = (
    (df['DURATION_MAX'] >= 0) &
    (df['REMAINING_DURATION_MAX'] >= 0)
)

df.loc[~rule_valid_duration].shape


(0, 55)

In [16]:
valid_gender = df['C_GIOITINH'].isin(['M', 'F', 'O'])
df.loc[~valid_gender].shape

(0, 55)

In [17]:
df['FLAG_INVALID'] = ~(
    rule_valid_age &
    rule_valid_ltv &
    rule_valid_duration &
    valid_gender
)

In [18]:
pd.set_option('display.max_columns', None)

In [19]:
df.describe()

Unnamed: 0,SOCIF,TRINHDO,SOHUUNHA,NHANVIENBIDV,INHERENT_RISK,REF_MONTH,REF_DAY,year,BASE_AUM,CURRENT_RISK,TUOI,INCOME,CBAL,CBALORG,AFLIMT_MAX,AFLIMT_MIN,AFLIMT_AVG,CBAL_AVG,CBAL_MAX,CBAL_MIN,COLLATERAL_VALUE,LTV,N_AVG_DEPOSIT_12M,N_AVG_DEPOSIT_6M,N_AVG_DD_12M,N_AVG_CD_12M,FLAG_SALARY_ACC,FLAG_DEPOSIT,CBAL_SHORTTERM_LOAN,CBAL_LONGTERM_LOAN,HAS_SHORTTERM_LOAN,HAS_LONGTERM_LOAN,DURATION_MAX,REMAINING_DURATION_MAX,TIME_TO_OP_MAX,RATE_AVG,PURCOD_MAX,PURCOD_MIN,MAX_DPD_12M,MAX_DPD_12M_OBS,AVG_OD_DPD_12M,SUM_ALL_OD_12M,BAD_CURRENT,XULYNO,MAX_NHOMNOCIC,N_AVG_OVERDUE_CBAL_12M,CBAL_TO_INC_12MON,REAL_GDP_GROWTH_12M,BAD_NEXT_12M,ROW_MISSING_PCT,FLAG_HIGH_MISSING
count,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0,1138159.0
mean,1200010.0,1.96298,0.3701521,0.0101752,-0.4510839,6.503928,14.51262,2020.529,15563470.0,-0.4226918,38.72472,24644710.0,356561500.0,392243800.0,771156500.0,694040900.0,732598700.0,356464800.0,428355200.0,285171900.0,752755100.0,132.6504,5593652.0,5593652.0,1118730.0,4474922.0,0.9130236,0.8283632,178094200.0,178467300.0,1.0,1.0,91.34755,47.47696,43.87059,8.20932,4.599582,4.599582,24.44912,24.44912,6.868355,48.41773,0.0,0.0,1.61801,166460900.0,14.42311,5.658553,0.08424306,0.0,0.0
std,115383.9,0.8243087,0.4828455,0.1003578,0.6123859,3.452058,8.088168,1.771243,22850070.0,0.755922,9.699934,10342980.0,253616000.0,280157400.0,358018100.0,322216300.0,340117200.0,258386300.0,310266900.0,206709000.0,967721500.0,88.00866,9936494.0,9936494.0,1987299.0,7949195.0,0.2818006,0.3770647,178574200.0,179159900.0,0.0,0.0,66.62703,44.69349,42.07309,2.019898,2.538326,2.538326,24.08371,24.08371,8.20005,52.01602,0.0,0.0,0.4858742,130604700.0,7.527059,2.034374,0.2777521,0.0,0.0
min,1000000.0,1.0,0.0,0.0,-1.414093,1.0,1.0,2018.0,42865.0,-3.417804,18.0,9490482.0,203239.0,218077.0,244790300.0,220311200.0,232550800.0,241415.0,289698.0,193132.0,34292.0,0.01015334,9742.0,9742.0,1948.0,7794.0,0.0,0.0,3.0,14.0,1.0,1.0,6.0,0.0,1.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.01559263,2.58,0.0,0.0,0.0
25%,1100335.0,1.0,0.0,0.0,-0.9131795,4.0,7.0,2019.0,4347134.0,-0.9459798,32.0,18141940.0,187427800.0,205622600.0,546426500.0,491783800.0,519105200.0,184932200.0,222386500.0,147945800.0,4774380.0,20.23715,1350567.0,1350567.0,270113.0,1080454.0,1.0,1.0,58759680.0,58823500.0,1.0,1.0,36.0,17.0,15.0,6.701521,2.0,2.0,5.0,5.0,1.0,8.0,0.0,0.0,1.0,83041660.0,8.748897,2.91,0.0,0.0,0.0
50%,1200003.0,2.0,0.0,0.0,-0.5662085,7.0,15.0,2021.0,8779688.0,-0.468453,39.0,22162470.0,293472800.0,322324700.0,682607200.0,614346500.0,648476900.0,291193600.0,350048300.0,232954900.0,14846670.0,200.0,2834877.0,2834877.0,566975.0,2267902.0,1.0,1.0,127960700.0,127966000.0,1.0,1.0,60.0,31.0,29.0,8.107369,6.0,6.0,16.0,16.0,4.0,29.0,0.0,0.0,2.0,138691500.0,13.05962,7.02,0.0,0.0,0.0
75%,1299878.0,2.0,1.0,0.0,-0.1107739,9.0,22.0,2022.0,17975030.0,0.04872816,45.0,27191650.0,449687100.0,494598100.0,872866100.0,785579500.0,829222800.0,449721100.0,540402000.0,359776900.0,2005353000.0,200.0,6056800.0,6056800.0,1211360.0,4845440.0,1.0,1.0,238689700.0,239107700.0,1.0,1.0,120.0,59.0,54.0,9.550126,6.0,6.0,39.0,39.0,10.0,73.0,0.0,0.0,2.0,217533700.0,18.65218,7.08,0.0,0.0,0.0
max,1399999.0,4.0,1.0,1.0,4.870131,12.0,28.0,2023.0,1036090000.0,4.977884,75.0,88040820.0,4628647000.0,5348516000.0,7591824000.0,6832642000.0,7212233000.0,5120370000.0,6144445000.0,4096296000.0,2828872000.0,200.0,692326600.0,692326600.0,138465300.0,553861300.0,1.0,1.0,4067473000.0,3652844000.0,1.0,1.0,240.0,227.0,216.0,20.23361,9.0,9.0,89.0,89.0,44.0,266.0,0.0,0.0,2.0,2314323000.0,88.39292,8.02,1.0,0.0,0.0


In [20]:
pd.set_option('display.max_columns', 20)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138159 entries, 0 to 1138158
Data columns (total 56 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   SOCIF                   1138159 non-null  int64  
 1   C_GIOITINH              1138159 non-null  object 
 2   TRINHDO                 1138159 non-null  int64  
 3   TTHONNHAN               1138159 non-null  object 
 4   SOHUUNHA                1138159 non-null  int64  
 5   NHANVIENBIDV            1138159 non-null  int64  
 6   INHERENT_RISK           1138159 non-null  float64
 7   REF_MONTH               1138159 non-null  int64  
 8   REF_DAY                 1138159 non-null  int64  
 9   year                    1138159 non-null  int64  
 10  BASE_AUM                1138159 non-null  int64  
 11  CURRENT_RISK            1138159 non-null  float64
 12  TUOI                    1138159 non-null  int64  
 13  SNAPSHOT_DATE           1138159 non-null  object 
 14  IN

In [22]:
cat_cols = df.select_dtypes(include=['object', 'category']).columns

for col in cat_cols:
    print(f"\n{'='*20} {col} {'='*20}")
    vc = df[col].value_counts(dropna=False)
    pct = df[col].value_counts(normalize=True, dropna=False) * 100
    print(pd.concat([vc, pct.rename('pct_%')], axis=1))


             count      pct_%
C_GIOITINH                   
F           559790  49.183814
M           555844  48.837113
O            22525   1.979073

            count      pct_%
TTHONNHAN                   
Married    595719  52.340578
Single     542440  47.659422

               count     pct_%
SNAPSHOT_DATE                 
2023-12-08       688  0.060448
2023-10-11       688  0.060448
2023-05-12       688  0.060448
2022-02-18       683  0.060009
2023-02-18       682  0.059921
...              ...       ...
2020-03-12       417  0.036638
2020-03-09       415  0.036462
2020-11-17       412  0.036199
2020-07-05       412  0.036199
2020-06-18       411  0.036111

[2016 rows x 2 columns]

              count      pct_%
SAMPLE_TYPE                   
TRAIN        720615  63.314089
OOT          210903  18.530188
OOS          206641  18.155723


### Tính chính xác

Tuổi khách hàng

In [23]:
rule_acc_age = df['TUOI'] >= 15

LTV thực tế

In [24]:
rule_acc_ltv = df['LTV'] < 400

Hạn mức – dư nợ

In [25]:
rule_acc_limit = df['AFLIMT_MAX'] >= df['CBAL']

Thu nhập – dư nợ

In [26]:
rule_acc_dti = df['CBAL'] <= df['INCOME'] * 20

CIC – DPD

In [27]:
rule_acc_cic = ~(
    (df['MAX_DPD_12M_OBS'] < 30) & (df['MAX_NHOMNOCIC'] > 1)
) & ~(
    (df['MAX_DPD_12M_OBS'] >= 90) & (df['MAX_NHOMNOCIC'] < 3)
)

Flag & tỷ lệ chính xác

In [28]:
df['FLAG_INACCURATE'] = ~(
    rule_acc_age &
    rule_acc_ltv &
    rule_acc_limit &
    rule_acc_dti &
    rule_acc_cic
)

accuracy_rate = 1 - df['FLAG_INACCURATE'].mean()
accuracy_rate

np.float64(0.565977161363219)

In [29]:
rule_acc_age.sum()

np.int64(1138159)

In [30]:
rule_acc_ltv.sum()

np.int64(1138159)

In [31]:
rule_acc_dti.sum()

np.int64(903249)

In [32]:
rule_acc_cic.sum()

np.int64(810189)

### Tính đồng nhất

CBAL vs cấu trúc kỳ hạn

In [33]:
# rule_cons_balance = (
#     df['CBAL'] ==
#     df['CBAL_SHORTTERM_LOAN']
#     + df['CBAL_MIDTERM_LOAN']
#     + df['CBAL_LONGTERM_LOAN']
# )

# df.loc[~rule_cons_balance].shape

Deposit breakdown

In [34]:
rule_cons_deposit = (
    abs(
        df['N_AVG_DEPOSIT_12M'] -
        (df['N_AVG_DD_12M'] + df['N_AVG_CD_12M'])
    ) <= 1
)
df.loc[~rule_cons_deposit].shape

(0, 57)

DPD logic

In [35]:
rule_cons_dpd = df['MAX_DPD_12M_OBS'] <= df['MAX_DPD_12M']

df.loc[~rule_cons_dpd].shape


(0, 57)

Flag tổng hợp consistency

In [36]:
df['FLAG_INCONSISTENT'] = ~(
    # rule_cons_balance &
    rule_cons_deposit &
    rule_cons_dpd
)

BẢNG TỔNG HỢP (RẤT NÊN CÓ)

In [37]:
quality_summary = pd.DataFrame({
    'Metric': [
        'Completeness',
        'Uniqueness',
        'Timeliness',
        'Validity',
        'Accuracy',
        'Consistency'
    ],
    'Issue_Rate_%': [
        df['FLAG_HIGH_MISSING'].mean() * 100,
        dup_full.mean() * 100,
        leak_check.shape[0] / len(df) * 100,
        df['FLAG_INVALID'].mean() * 100,
        df['FLAG_INACCURATE'].mean() * 100,
        df['FLAG_INCONSISTENT'].mean() * 100
    ]
})

quality_summary

Unnamed: 0,Metric,Issue_Rate_%
0,Completeness,0.0
1,Uniqueness,0.0
2,Timeliness,0.0
3,Validity,0.0
4,Accuracy,43.402284
5,Consistency,0.0


In [38]:
df['FLAG_AGE_INVALID'] = (~rule_acc_age).astype(int)
df['FLAG_LTV_HIGH']    = (~rule_acc_ltv).astype(int)
df['FLAG_DTI_HIGH']    = (~rule_acc_dti).astype(int)
df['FLAG_CIC_LOW']     = (~rule_acc_cic).astype(int)
# df['FLAG_INCONSISTENT_DEPOSIT'] = (~rule_cons_deposit).astype(int)

In [39]:
import numpy as np

In [40]:
flag_cols = [c for c in df.columns if c.startswith('FLAG_')]

dq_flag_impact = []

for c in flag_cols:
    tmp = df.groupby(c)['BAD_NEXT_12M'].mean()
    dq_flag_impact.append({
        'FLAG': c,
        'BAD_RATE_0': tmp.get(0, np.nan),
        'BAD_RATE_1': tmp.get(1, np.nan),
        'LIFT': tmp.get(1, np.nan) / tmp.get(0, np.nan)
    })

dq_flag_impact = pd.DataFrame(dq_flag_impact).sort_values('LIFT', ascending=False)
dq_flag_impact

Unnamed: 0,FLAG,BAD_RATE_0,BAD_RATE_1,LIFT
8,FLAG_DTI_HIGH,0.079144,0.103848,1.312139
4,FLAG_INACCURATE,0.097212,0.067332,0.692631
0,FLAG_SALARY_ACC,0.127272,0.080144,0.629709
9,FLAG_CIC_LOW,0.104162,0.035037,0.336367
1,FLAG_DEPOSIT,0.20795,0.058611,0.281852
2,FLAG_HIGH_MISSING,0.084243,,
3,FLAG_INVALID,0.084243,,
5,FLAG_INCONSISTENT,0.084243,,
6,FLAG_AGE_INVALID,0.084243,,
7,FLAG_LTV_HIGH,0.084243,,


In [41]:
for f in ['FLAG_HIGH_MISSING', 'FLAG_INCONSISTENT']:
    print(f, df[f].value_counts(dropna=False))


FLAG_HIGH_MISSING FLAG_HIGH_MISSING
0    1138159
Name: count, dtype: int64
FLAG_INCONSISTENT FLAG_INCONSISTENT
False    1138159
Name: count, dtype: int64


In [42]:
df.shape

(1138159, 62)

In [43]:
df.drop(['FLAG_DTI_HIGH', 'FLAG_INACCURATE', 'FLAG_CIC_LOW', 'FLAG_HIGH_MISSING', 'FLAG_INCONSISTENT', 'ROW_MISSING_PCT'], axis=1, inplace=True)

In [44]:
df.columns

Index(['SOCIF', 'C_GIOITINH', 'TRINHDO', 'TTHONNHAN', 'SOHUUNHA',
       'NHANVIENBIDV', 'INHERENT_RISK', 'REF_MONTH', 'REF_DAY', 'year',
       'BASE_AUM', 'CURRENT_RISK', 'TUOI', 'SNAPSHOT_DATE', 'INCOME', 'CBAL',
       'CBALORG', 'AFLIMT_MAX', 'AFLIMT_MIN', 'AFLIMT_AVG', 'CBAL_AVG',
       'CBAL_MAX', 'CBAL_MIN', 'COLLATERAL_VALUE', 'LTV', 'N_AVG_DEPOSIT_12M',
       'N_AVG_DEPOSIT_6M', 'N_AVG_DD_12M', 'N_AVG_CD_12M', 'FLAG_SALARY_ACC',
       'FLAG_DEPOSIT', 'CBAL_SHORTTERM_LOAN', 'CBAL_LONGTERM_LOAN',
       'HAS_SHORTTERM_LOAN', 'HAS_LONGTERM_LOAN', 'DURATION_MAX',
       'REMAINING_DURATION_MAX', 'TIME_TO_OP_MAX', 'RATE_AVG', 'PURCOD_MAX',
       'PURCOD_MIN', 'MAX_DPD_12M', 'MAX_DPD_12M_OBS', 'AVG_OD_DPD_12M',
       'SUM_ALL_OD_12M', 'BAD_CURRENT', 'XULYNO', 'MAX_NHOMNOCIC',
       'N_AVG_OVERDUE_CBAL_12M', 'CBAL_TO_INC_12MON', 'REAL_GDP_GROWTH_12M',
       'BAD_NEXT_12M', 'SAMPLE_TYPE', 'FLAG_INVALID', 'FLAG_AGE_INVALID',
       'FLAG_LTV_HIGH'],
      dtype='object')

In [45]:
df_train = df[df['SAMPLE_TYPE'] == 'TRAIN']
df_train_y = df_train['BAD_NEXT_12M']
df_OOS = df[df['SAMPLE_TYPE'] == 'OOS']
df_OOS_y = df_OOS['BAD_NEXT_12M']
df_OOT = df[df['SAMPLE_TYPE'] == 'OOT']
df_OOT_y = df_OOT['BAD_NEXT_12M']

In [46]:
df_train.drop(['SAMPLE_TYPE','BAD_NEXT_12M'], axis=1, inplace=True)
df_OOS.drop(['SAMPLE_TYPE', 'BAD_NEXT_12M'], axis=1, inplace=True)
df_OOT.drop(['SAMPLE_TYPE', 'BAD_NEXT_12M'], axis=1, inplace=True)

In [47]:
df_train.columns

Index(['SOCIF', 'C_GIOITINH', 'TRINHDO', 'TTHONNHAN', 'SOHUUNHA',
       'NHANVIENBIDV', 'INHERENT_RISK', 'REF_MONTH', 'REF_DAY', 'year',
       'BASE_AUM', 'CURRENT_RISK', 'TUOI', 'SNAPSHOT_DATE', 'INCOME', 'CBAL',
       'CBALORG', 'AFLIMT_MAX', 'AFLIMT_MIN', 'AFLIMT_AVG', 'CBAL_AVG',
       'CBAL_MAX', 'CBAL_MIN', 'COLLATERAL_VALUE', 'LTV', 'N_AVG_DEPOSIT_12M',
       'N_AVG_DEPOSIT_6M', 'N_AVG_DD_12M', 'N_AVG_CD_12M', 'FLAG_SALARY_ACC',
       'FLAG_DEPOSIT', 'CBAL_SHORTTERM_LOAN', 'CBAL_LONGTERM_LOAN',
       'HAS_SHORTTERM_LOAN', 'HAS_LONGTERM_LOAN', 'DURATION_MAX',
       'REMAINING_DURATION_MAX', 'TIME_TO_OP_MAX', 'RATE_AVG', 'PURCOD_MAX',
       'PURCOD_MIN', 'MAX_DPD_12M', 'MAX_DPD_12M_OBS', 'AVG_OD_DPD_12M',
       'SUM_ALL_OD_12M', 'BAD_CURRENT', 'XULYNO', 'MAX_NHOMNOCIC',
       'N_AVG_OVERDUE_CBAL_12M', 'CBAL_TO_INC_12MON', 'REAL_GDP_GROWTH_12M',
       'FLAG_INVALID', 'FLAG_AGE_INVALID', 'FLAG_LTV_HIGH'],
      dtype='object')