In [None]:
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_parquet('gen_data.parquet')

### Tính đầy đủ 

In [3]:
pd.set_option('display.max_rows', None)

Tỷ lệ missing theo cột

In [4]:
missing_col = pd.DataFrame({
    'missing_count': df.isna().sum(),
    'missing_pct': df.isna().mean() * 100
}).sort_values('missing_pct', ascending=False)

missing_col

Unnamed: 0,missing_count,missing_pct
COLLATERAL_VALUE,475839,19.826625
INCOME,120000,5.0
TRINHDO,72000,3.0
SOCIF,0,0.0
CBAL_LONGTERM_LOAN,0,0.0
OCCUPATION_TYPE,0,0.0
CNT_DPD_30PLUS_6M,0,0.0
HAS_LONGTERM_LOAN,0,0.0
HAS_SHORTTERM_LOAN,0,0.0
CBAL_SHORTTERM_LOAN,0,0.0


In [5]:
pd.set_option('display.max_rows', 20)

Tỷ lệ missing theo row

In [6]:
df['ROW_MISSING_PCT'] = df.isna().mean(axis=1) * 100

df['ROW_MISSING_PCT'].describe()

count    2.400000e+06
mean     4.092151e-01
std      7.138921e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.470588e+00
max      4.411765e+00
Name: ROW_MISSING_PCT, dtype: float64

In [7]:
df['FLAG_HIGH_MISSING'] = (df['ROW_MISSING_PCT'] > 40).astype(int)

### Tính duy nhất 

Trùng khóa logic (SOCIF – year)

In [8]:
dup_key = df.duplicated(subset=['SOCIF', 'year'], keep=False)

df.loc[dup_key].shape

(0, 70)

Trùng toàn bộ bản ghi (record duplicate)

In [9]:
dup_full = df.duplicated(keep=False)

dup_full.sum()


np.int64(0)

In [10]:
dup_rate = dup_full.mean() * 100
print(f"Tỷ lệ bản ghi trùng lặp: {dup_rate:.2f}%")


Tỷ lệ bản ghi trùng lặp: 0.00%


### Tính kịp thời 

Check dùng future information (leakage)

In [11]:
# BAD_NEXT_12M chỉ được missing ở năm cuối
leak_check = df[
    (df['year'] < df['year'].max()) &
    (df['BAD_NEXT_12M'].isna())
]

leak_check.shape


(0, 70)

Check continuity theo năm

In [12]:
year_count = df.groupby('year')['SOCIF'].nunique()
year_count

year
2018    400000
2019    400000
2020    400000
2021    400000
2022    400000
2023    400000
Name: SOCIF, dtype: int64

### Tính phù hợp 

In [13]:
rule_valid_age = (
    df['TUOI'].between(0, 120)
)

df.loc[~rule_valid_age].shape


(0, 70)

In [14]:
rule_valid_ltv = df['LTV'].between(0, 400)

df.loc[~rule_valid_ltv].shape


(0, 70)

In [15]:
rule_valid_duration = (
    (df['DURATION_MAX'] >= 0) &
    (df['REMAINING_DURATION_MAX'] >= 0)
)

df.loc[~rule_valid_duration].shape


(0, 70)

In [16]:
valid_gender = df['C_GIOITINH'].isin(['M', 'F', 'O'])
df.loc[~valid_gender].shape

(0, 70)

In [17]:
df['FLAG_INVALID'] = ~(
    rule_valid_age &
    rule_valid_ltv &
    rule_valid_duration &
    valid_gender
)

In [18]:
pd.set_option('display.max_columns', None)

In [19]:
df.describe()

Unnamed: 0,SOCIF,TRINHDO,SOHUUNHA,NHANVIENBIDV,INHERENT_RISK,REF_MONTH,REF_DAY,year,BASE_AUM,CURRENT_RISK,TUOI,SNAPSHOT_DATE,INCOME,CBAL,CBALORG,AFLIMT_MAX,AFLIMT_MIN,AFLIMT_AVG,CBAL_AVG,CBAL_MAX,CBAL_MIN,COLLATERAL_VALUE,LTV,N_AVG_DEPOSIT_12M,N_AVG_DEPOSIT_6M,N_AVG_DD_12M,N_AVG_CD_12M,FLAG_SALARY_ACC,FLAG_DEPOSIT,UTILIZATION_RATE,CNT_CREDIT_CARDS,AMT_CASH_ADVANCE_12M,FLAG_CASH_ADVANCE,PCT_PAYMENT_TO_BALANCE,CNT_MIN_PAY_6M,AVG_DAYS_PAST_DUE,DTI_RATIO,PTI_RATIO,MOB,CNT_OTHER_PRODUCTS,LIMIT_TO_INCOME,AMT_VAR_6M,CBAL_SHORTTERM_LOAN,CBAL_LONGTERM_LOAN,HAS_SHORTTERM_LOAN,HAS_LONGTERM_LOAN,CNT_DPD_30PLUS_6M,OCCUPATION_TYPE,DURATION_MAX,REMAINING_DURATION_MAX,TIME_TO_OP_MAX,RATE_AVG,PURCOD_MAX,PURCOD_MIN,MAX_DPD_12M,MAX_DPD_12M_OBS,AVG_OD_DPD_12M,SUM_ALL_OD_12M,BAD_CURRENT,XULYNO,MAX_NHOMNOCIC,N_AVG_OVERDUE_CBAL_12M,CBAL_TO_INC_12MON,REAL_GDP_GROWTH_12M,BAD_NEXT_12M,ROW_MISSING_PCT,FLAG_HIGH_MISSING
count,2400000.0,2328000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000,2280000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,1924161.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0,2400000.0
mean,1200000.0,1.8558,0.3382225,0.0102975,7.010688e-17,6.498735,14.5208,2020.5,10400780.0,0.1665974,37.20353,2020-12-29 02:54:56.159998976,22628600.0,329764200.0,362754800.0,716308300.0,644677500.0,680492900.0,329715700.0,396212000.0,263772500.0,852322100.0,120.7265,3240266.0,3240266.0,648052.7,2592213.0,0.8624429,0.5463367,0.4356772,1.494581,59749190.0,0.181825,1.067599,0.9637629,2.372091,1.170814,0.8601475,62.51819,0.5988854,31.60412,0.0001089114,164878800.0,164885400.0,0.8747633,0.8747633,0.2593213,2.002271,82.11065,42.62277,39.48788,9.366556,4.597413,4.597413,88.44162,88.44162,0.0,176.557,0.1767304,0.0723475,1.84441,133023100.0,14.04976,5.443333,0.1308383,0.4092151,0.0
min,1000000.0,1.0,0.0,0.0,-1.414093,1.0,1.0,2018.0,19766.0,-3.417804,18.0,2018-01-01 00:00:00,8036520.0,0.0,0.0,207030000.0,186327000.0,196678500.0,0.0,0.0,0.0,21172.0,0.0,74.0,74.0,14.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,6.0,0.0,24.0,-1.031925,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,1.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.58,0.0,0.0,0.0
25%,1100000.0,1.0,0.0,0.0,-0.7346683,4.0,7.75,2019.0,2352022.0,-0.6313609,30.0,2019-06-28 00:00:00,16734880.0,162387900.0,177894000.0,504288100.0,453859300.0,479073700.0,159375100.0,191693000.0,127500100.0,3001943.0,14.30676,426522.5,426522.5,85304.0,341218.5,1.0,0.0,0.2690217,1.0,0.0,0.0,0.8233325,0.0,0.0,0.6758605,0.4532833,34.0,0.0,27.52027,-0.134939,37353440.0,37228860.0,1.0,1.0,0.0,1.0,36.0,13.0,11.0,7.452226,2.0,2.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,8.110325,2.91,0.0,0.0,0.0
50%,1200000.0,2.0,0.0,0.0,-0.2286553,6.0,15.0,2020.5,5215838.0,0.002903157,37.0,2020-12-30 00:00:00,20311910.0,282037400.0,309679100.0,629299300.0,566369400.0,597834300.0,279435500.0,335903200.0,223548400.0,13286780.0,200.0,1190874.0,1190874.0,238174.0,952700.0,1.0,1.0,0.4454427,1.0,0.0,0.0,1.077865,0.0,0.0,1.111008,0.7590277,63.0,0.0,30.84363,0.0001673022,115742800.0,115789500.0,1.0,1.0,0.0,2.0,60.0,27.0,25.0,9.114593,6.0,6.0,8.0,8.0,0.0,14.0,0.0,0.0,1.0,112309800.0,13.33209,6.035,0.0,0.0,0.0
75%,1299999.0,2.0,1.0,0.0,0.4921627,9.0,22.0,2022.0,11536070.0,0.7871727,44.0,2022-06-28 00:00:00,25204320.0,441303300.0,485289600.0,811022400.0,729920100.0,770471300.0,441008700.0,529911000.0,352806900.0,2004105000.0,200.0,3199729.0,3199729.0,639945.5,2559783.0,1.0,1.0,0.6306674,2.0,0.0,0.0,1.323501,1.0,2.0,1.619395,1.19193,91.0,1.0,34.06661,0.1350352,231331500.0,231267400.0,1.0,1.0,0.0,3.0,120.0,54.0,50.0,10.96344,6.0,6.0,45.0,45.0,0.0,86.0,0.0,0.0,2.0,197114800.0,19.43274,7.08,0.0,1.470588,0.0
max,1399999.0,4.0,1.0,1.0,9.261991,12.0,28.0,2023.0,1036090000.0,10.38149,75.0,2023-12-28 00:00:00,88040820.0,6441938000.0,7057989000.0,9811712000.0,8830541000.0,9321127000.0,7417343000.0,8900812000.0,5933875000.0,2828872000.0,200.0,692326600.0,692326600.0,138465300.0,553861300.0,1.0,1.0,0.9993465,5.0,5356833000.0,1.0,2.802017,6.0,259.0,9.039584,2.0,119.0,3.0,141.4493,1.033878,6150340000.0,3822588000.0,1.0,1.0,14.0,4.0,240.0,227.0,216.0,25.0,9.0,9.0,900.0,900.0,0.0,2699.0,1.0,1.0,5.0,3220969000.0,108.475,8.02,1.0,4.411765,0.0
std,115470.1,0.7942454,0.4731048,0.1009528,1.0,3.451478,8.088572,1.707825,17597770.0,1.142452,9.734701,,9512164.0,269116800.0,297068400.0,341720500.0,307548400.0,324634500.0,273453000.0,328395400.0,218762400.0,989385400.0,92.52269,7445974.0,7445974.0,1489195.0,5956779.0,0.3444346,0.4978484,0.2462469,1.20517,161215900.0,0.3857003,0.3735395,1.660909,6.324145,0.7598772,0.5736481,32.91515,0.8596236,6.185877,0.2001026,182269600.0,182223300.0,0.3309871,0.3309871,0.6576934,1.000679,67.33142,43.92247,41.22742,2.670365,2.536789,2.536789,211.5578,211.5578,0.0,443.5798,0.3814405,0.2590625,1.202726,137396400.0,9.118526,2.104556,0.3372235,0.7138921,0.0


In [20]:
pd.set_option('display.max_columns', 20)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400000 entries, 0 to 2399999
Data columns (total 71 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   SOCIF                   int64         
 1   C_GIOITINH              object        
 2   TRINHDO                 float64       
 3   TTHONNHAN               object        
 4   SOHUUNHA                int64         
 5   NHANVIENBIDV            int64         
 6   INHERENT_RISK           float64       
 7   REF_MONTH               int32         
 8   REF_DAY                 int32         
 9   year                    int64         
 10  BASE_AUM                int64         
 11  CURRENT_RISK            float64       
 12  TUOI                    int64         
 13  SNAPSHOT_DATE           datetime64[ns]
 14  INCOME                  float64       
 15  CBAL                    int64         
 16  CBALORG                 int64         
 17  AFLIMT_MAX              int64         
 18  AF

In [22]:
cat_cols = df.select_dtypes(include=['object', 'category']).columns

for col in cat_cols:
    print(f"\n{'='*20} {col} {'='*20}")
    vc = df[col].value_counts(dropna=False)
    pct = df[col].value_counts(normalize=True, dropna=False) * 100
    print(pd.concat([vc, pct.rename('pct_%')], axis=1))


              count     pct_%
C_GIOITINH                   
F           1176654  49.02725
M           1175634  48.98475
O             47712   1.98800

             count   pct_%
TTHONNHAN                 
Single     1311000  54.625
Married    1089000  45.375

               count      pct_%
SAMPLE_TYPE                    
TRAIN        1600000  66.666667
OOS           400000  16.666667
OOT           400000  16.666667


### Tính chính xác

Tuổi khách hàng

In [23]:
rule_acc_age = df['TUOI'].between(18, 80)

LTV thực tế

In [24]:
rule_acc_ltv = df['LTV'] >= 0

Hạn mức – dư nợ

In [25]:
rule_acc_amount = (df['AFLIMT_MAX'] >= 0) & (df['CBAL'] >= 0) & (df['INCOME'] >= 0)

In [26]:
rule_acc_limit = df['AFLIMT_MAX'] >= 0 

Thu nhập – dư nợ

In [27]:
rule_acc_dti = df['DTI_RATIO'].between(0, 100)

CIC – DPD

In [28]:
rule_acc_cic = ~((df['MAX_DPD_12M_OBS'] >= 90) & (df['MAX_NHOMNOCIC'] < 3))

Flag & tỷ lệ chính xác

In [29]:
df['FLAG_INACCURATE'] = ~(
    rule_acc_age &
    rule_acc_ltv &
    rule_acc_limit &
    rule_acc_amount &
    rule_acc_dti &
    rule_acc_cic
)

accuracy_rate = 1 - df['FLAG_INACCURATE'].mean()
accuracy_rate

np.float64(0.95)

In [30]:
rule_acc_age.sum()

np.int64(2400000)

In [31]:
rule_acc_ltv.sum()

np.int64(2400000)

In [32]:
rule_acc_dti.sum()

np.int64(2400000)

In [33]:
rule_acc_cic.sum()

np.int64(2400000)

### Tính đồng nhất

CBAL vs cấu trúc kỳ hạn

In [34]:
import numpy as np
rule_cons_balance = np.isclose(
    df['CBAL'], 
    df['CBAL_SHORTTERM_LOAN'] + df['CBAL_LONGTERM_LOAN'], 
    atol=1.0 # Cho phép sai số 1 đơn vị tiền tệ
)

df.loc[~rule_cons_balance].shape

(0, 72)

Deposit breakdown

In [35]:
rule_cons_deposit = (
    abs(
        df['N_AVG_DEPOSIT_12M'] -
        (df['N_AVG_DD_12M'] + df['N_AVG_CD_12M'])
    ) <= 1
)
df.loc[~rule_cons_deposit].shape

(0, 72)

DPD logic

In [36]:
rule_cons_dpd = df['MAX_DPD_12M_OBS'] <= df['MAX_DPD_12M']

df.loc[~rule_cons_dpd].shape


(0, 72)

In [37]:
rule_struct_missing = ~(
    (df['MOB'] < 3) & (df['AMT_VAR_6M'].notna())
)

In [38]:
rule_flag_logic = ~(
    ((df['HAS_SHORTTERM_LOAN'] == 1) & (df['CBAL_SHORTTERM_LOAN'] <= 0)) |
    ((df['HAS_SHORTTERM_LOAN'] == 0) & (df['CBAL_SHORTTERM_LOAN'] > 0))
)

In [39]:
df['FLAG_OVERLIMIT'] = (df['CBAL'] > df['AFLIMT_MAX']).astype(int)
print(f"Số lượng vi phạm: {df['FLAG_OVERLIMIT'].sum()}")

Số lượng vi phạm: 0


Flag tổng hợp consistency

In [40]:
df['FLAG_INCONSISTENT'] = ~(
    rule_cons_balance &
    rule_cons_deposit &
    rule_cons_dpd &
    rule_struct_missing &
    rule_flag_logic
)

BẢNG TỔNG HỢP (RẤT NÊN CÓ)

In [41]:
quality_summary = pd.DataFrame({
    'Metric': [
        'Completeness',
        'Uniqueness',
        'Timeliness',
        'Validity',
        'Accuracy',
        'Consistency'
    ],
    'Issue_Rate_%': [
        df['FLAG_HIGH_MISSING'].mean() * 100,
        dup_full.mean() * 100,
        leak_check.shape[0] / len(df) * 100,
        df['FLAG_INVALID'].mean() * 100,
        df['FLAG_INACCURATE'].mean() * 100,
        df['FLAG_INCONSISTENT'].mean() * 100
    ]
})

quality_summary

Unnamed: 0,Metric,Issue_Rate_%
0,Completeness,0.0
1,Uniqueness,0.0
2,Timeliness,0.0
3,Validity,0.0
4,Accuracy,5.0
5,Consistency,0.0


In [42]:
df['FLAG_AGE_INVALID'] = (~rule_acc_age).astype(int)
df['FLAG_LTV_HIGH']    = (~rule_acc_ltv).astype(int)
df['FLAG_DTI_HIGH']    = (~rule_acc_dti).astype(int)
df['FLAG_CIC_LOW']     = (~rule_acc_cic).astype(int)
df['FLAG_INCONSISTENT_DEPOSIT'] = (~rule_cons_deposit).astype(int)

In [44]:
flag_cols = [c for c in df.columns if c.startswith('FLAG_')]

dq_flag_impact = []

for c in flag_cols:
    tmp = df.groupby(c)['BAD_NEXT_12M'].mean()
    dq_flag_impact.append({
        'FLAG': c,
        'BAD_RATE_0': tmp.get(0, np.nan),
        'BAD_RATE_1': tmp.get(1, np.nan),
        'LIFT': tmp.get(1, np.nan) / tmp.get(0, np.nan)
    })

dq_flag_impact = pd.DataFrame(dq_flag_impact).sort_values('LIFT', ascending=False)
dq_flag_impact

Unnamed: 0,FLAG,BAD_RATE_0,BAD_RATE_1,LIFT
2,FLAG_CASH_ADVANCE,0.106665,0.239615,2.246427
5,FLAG_INACCURATE,0.130827,0.13105,1.001703
0,FLAG_SALARY_ACC,0.269634,0.108701,0.403143
1,FLAG_DEPOSIT,0.246649,0.034673,0.140575
3,FLAG_HIGH_MISSING,0.130838,,
4,FLAG_INVALID,0.130838,,
6,FLAG_OVERLIMIT,0.130838,,
7,FLAG_INCONSISTENT,0.130838,,
8,FLAG_AGE_INVALID,0.130838,,
9,FLAG_LTV_HIGH,0.130838,,


In [45]:
for f in ['FLAG_HIGH_MISSING', 'FLAG_INCONSISTENT']:
    print(f, df[f].value_counts(dropna=False))


FLAG_HIGH_MISSING FLAG_HIGH_MISSING
0    2400000
Name: count, dtype: int64
FLAG_INCONSISTENT FLAG_INCONSISTENT
False    2400000
Name: count, dtype: int64


In [46]:
dqa_cols_to_drop = [
    'FLAG_INVALID', 'FLAG_AGE_INVALID', 'FLAG_LTV_HIGH', 
    'FLAG_INCONSISTENT_DEPOSIT', 'FLAG_INCONSISTENT_DEPOSIT' 
    # Và các cột FLAG tạm khác nếu có
]
df.drop(columns=[c for c in dqa_cols_to_drop if c in df.columns], inplace=True)

In [47]:
print("7. Lọc bỏ hồ sơ rác & Đã xấu (Population Filter)...")

cond_remove = (
    (df['BAD_CURRENT'] == 1) |      # Loại người đang Bad
    (df['MAX_NHOMNOCIC'] >= 3) |    # Loại nhóm nợ xấu
    (df['XULYNO'] == 1) |           # Loại đã xử lý nợ
    (df['TUOI'] < 18) |   
    (df['TUOI'] > 80) |
    (df['DTI_RATIO'] > 100) |                            # Loại trẻ em
    (df['CBAL'] <= 100000) |             # Loại không có dư nợ (Inactive)
    (df['INCOME'] <= 0)
)

df = df[~cond_remove].copy()

7. Lọc bỏ hồ sơ rác & Đã xấu (Population Filter)...


In [48]:
df.drop(['FLAG_DTI_HIGH', 'FLAG_INACCURATE', 'FLAG_CIC_LOW', 'FLAG_HIGH_MISSING', 'FLAG_INCONSISTENT', 'ROW_MISSING_PCT', 'SOCIF', 'SNAPSHOT_DATE', 'REF_MONTH', 'REF_DAY', 'BAD_CURRENT', 'XULYNO', 'MAX_NHOMNOCIC', 'INHERENT_RISK', 'CURRENT_RISK', 'year', 'HAS_LONGTERM_LOAN', 'HAS_SHORTTERM_LOAN', 'FLAG_OVERLIMIT'], axis=1, inplace=True)

In [49]:
df.shape

(1740237, 56)

In [50]:
df.columns

Index(['C_GIOITINH', 'TRINHDO', 'TTHONNHAN', 'SOHUUNHA', 'NHANVIENBIDV',
       'BASE_AUM', 'TUOI', 'INCOME', 'CBAL', 'CBALORG', 'AFLIMT_MAX',
       'AFLIMT_MIN', 'AFLIMT_AVG', 'CBAL_AVG', 'CBAL_MAX', 'CBAL_MIN',
       'COLLATERAL_VALUE', 'LTV', 'N_AVG_DEPOSIT_12M', 'N_AVG_DEPOSIT_6M',
       'N_AVG_DD_12M', 'N_AVG_CD_12M', 'FLAG_SALARY_ACC', 'FLAG_DEPOSIT',
       'UTILIZATION_RATE', 'CNT_CREDIT_CARDS', 'AMT_CASH_ADVANCE_12M',
       'FLAG_CASH_ADVANCE', 'PCT_PAYMENT_TO_BALANCE', 'CNT_MIN_PAY_6M',
       'AVG_DAYS_PAST_DUE', 'DTI_RATIO', 'PTI_RATIO', 'MOB',
       'CNT_OTHER_PRODUCTS', 'LIMIT_TO_INCOME', 'AMT_VAR_6M',
       'CBAL_SHORTTERM_LOAN', 'CBAL_LONGTERM_LOAN', 'CNT_DPD_30PLUS_6M',
       'OCCUPATION_TYPE', 'DURATION_MAX', 'REMAINING_DURATION_MAX',
       'TIME_TO_OP_MAX', 'RATE_AVG', 'PURCOD_MAX', 'PURCOD_MIN', 'MAX_DPD_12M',
       'MAX_DPD_12M_OBS', 'AVG_OD_DPD_12M', 'SUM_ALL_OD_12M',
       'N_AVG_OVERDUE_CBAL_12M', 'CBAL_TO_INC_12MON', 'REAL_GDP_GROWTH_12M',
       'BAD

In [51]:
target_col = 'BAD_NEXT_12M'
remove_cols = [target_col, 'SAMPLE_TYPE'] # Loại bỏ Target và nhãn chia tập khỏi X

# 2. Lọc dữ liệu theo từng tập
# TRAIN
mask_train = df['SAMPLE_TYPE'] == 'TRAIN'
train = df[mask_train].drop(columns='SAMPLE_TYPE')

# OOS
# mask_oos = df['SAMPLE_TYPE'] == 'OOS'
# X_oos = df[mask_oos].drop(columns=remove_cols)
# y_oos = df.loc[mask_oos, target_col]
mask_train = df['SAMPLE_TYPE'] == 'OOS'
oos = df[mask_train].drop(columns='SAMPLE_TYPE')

# OOT
# mask_oot = df['SAMPLE_TYPE'] == 'OOT'
# X_oot = df[mask_oot].drop(columns=remove_cols)
# y_oot = df.loc[mask_oot, target_col
mask_train = df['SAMPLE_TYPE'] == 'OOT'
oot = df[mask_train].drop(columns='SAMPLE_TYPE')

# 3. Kiểm tra kích thước (Shape Check) để đảm bảo không bị lệch dòng
print(f"TRAIN: {train.shape}")
print(f"OOS: {oos.shape}")
print(f"OOT: {oot.shape}")

# 4. Xuất ra file Parquet
# Lưu ý: X lưu dạng DataFrame, y lưu dạng Series (hoặc DataFrame 1 cột để an toàn index)
train.to_parquet('train.parquet', index=False)
oos.to_parquet('oos.parquet', index=False)
oot.to_parquet('oot.parquet', index=False)

# X_oos.to_parquet('X_oos.parquet', index=False)
# y_oos.to_frame().to_parquet('y_oos.parquet', index=False)

# X_oot.to_parquet('X_oot.parquet', index=False)
# y_oot.to_frame().to_parquet('y_oot.parquet', index=False)

TRAIN: (1137807, 55)
OOS: (300317, 55)
OOT: (302113, 55)
