In [61]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
pd.set_option('display.max_columns', None)

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# ==========================================
# 1. CONFIGURATION & INITIALISATION
# ==========================================
NUM_CUSTOMERS = 2000000  # Number of unique customers
YEARS = [2018, 2019, 2020, 2021, 2022, 2023]
# Seed BOTH generators. NumPy drives most draws, but generate_priority_date samples
# its day offset with the stdlib `random` module, which np.random.seed() does not touch;
# without the second seed the observation dates differ on every run.
np.random.seed(42)
random.seed(42)

print("1. Đang tạo hồ sơ tĩnh (Static Profile)...")
# --- 1.1 Age & education level ---
# Ages ~ Normal(35, 10), truncated to whole years and bounded to [18, 65].
age_array = np.clip(np.random.normal(35, 10, NUM_CUSTOMERS).astype(int), 18, 65)

# An education code is drawn for every customer under each age regime up front;
# the draw matching the customer's age band is then kept (array ops instead of
# a per-row if/else). Bands: under 22, 22 to 25 inclusive, over 25.
edu_under_22 = np.random.choice([1, 2], NUM_CUSTOMERS, p=[0.9, 0.1])
edu_22_to_25 = np.random.choice([1, 2, 3], NUM_CUSTOMERS, p=[0.3, 0.6, 0.1])
edu_over_25 = np.random.choice([1, 2, 3, 4], NUM_CUSTOMERS, p=[0.3, 0.5, 0.15, 0.05])
edu_array = np.where(
    age_array < 22,
    edu_under_22,
    np.where(age_array <= 25, edu_22_to_25, edu_over_25),
)

# --- 1.2 Assets (AUM) & segment binning ---
# Log-normal assets give a right-skewed distribution (many small balances, few large).
# mu=17 puts the median near exp(17) ~ 24 million; sigma=2.0 stretches the right
# tail past one billion.
base_aum = np.random.lognormal(17, 2.0, NUM_CUSTOMERS).astype(np.int64)

# Segment thresholds applied to the simulated assets:
#   below 100 million            -> Mass
#   100 million up to 1 billion  -> Upper
#   1 billion and above          -> Private
segment_array = np.where(
    base_aum < 100_000_000,
    'Mass',
    np.where(base_aum < 1_000_000_000, 'Upper', 'Private'),
)

# --- 1.3 Assemble the customer master DataFrame ---
# Columns are inserted in their final order; the random draws below happen in that
# same order, so the generated values depend on the sequence of these lines.
first_socif = 1000000
profile = {}
profile['SOCIF'] = np.arange(first_socif, first_socif + NUM_CUSTOMERS)
profile['C_GIOITINH'] = np.random.choice(['M', 'F', 'O'], NUM_CUSTOMERS, p=[0.49, 0.49, 0.02])

profile['BASE_AGE'] = age_array
profile['TRINHDO'] = edu_array

profile['BASE_AUM'] = base_aum                  # raw simulated assets, kept for traceability
profile['final_CST_MKT_SEG'] = segment_array    # segment binned from BASE_AUM

profile['TTHONNHAN'] = np.random.choice(['Single', 'Married'], NUM_CUSTOMERS)
profile['SOHUUNHA'] = np.random.choice([0, 1], NUM_CUSTOMERS)
profile['NHANVIENBIDV'] = np.random.choice([0, 1], NUM_CUSTOMERS, p=[0.99, 0.01])
profile['INHERENT_RISK_SCORE'] = np.random.normal(0, 1, NUM_CUSTOMERS)

customers = pd.DataFrame(profile)

# ==========================================
# 2. MỞ RỘNG DỮ LIỆU THEO NĂM (PANEL DATA)
# ==========================================
print("2. Expand dữ liệu theo năm...")
# One 85% sample of the customer base per year. Each year is sampled independently
# from the full base (seeded by the year), so a customer may be absent one year and
# present the next. Sampling first and tagging the year afterwards avoids copying
# the whole customer frame once per year only to discard 15% of it.
df_list = [
    customers.sample(frac=0.85, random_state=year).assign(year=year)
    for year in YEARS
]

df = pd.concat(df_list, ignore_index=True)
df = df.sort_values(by=['SOCIF', 'year'])
# BASE_AGE is the age in the first simulated year; derive that year from YEARS
# instead of hard-coding it so the two cannot drift apart.
df['TUOI'] = df['BASE_AGE'] + (df['year'] - min(YEARS))

# ==========================================
# 3. LOGIC CHỌN NGÀY & BAD FLAG
# ==========================================
print("3. Tính toán BAD & Chọn ngày quan sát ưu tiên...")

# 3.1 Determine the BAD flag
def calculate_bad_flag(row):
    """Draw a 0/1 BAD flag for one panel row.

    The bad probability is 1 / (1 + exp(score + 2)), where score is the row's
    INHERENT_RISK_SCORE lowered by 0.5 when the row's year is 2020 or 2021.
    Note the direction: a larger INHERENT_RISK_SCORE yields a smaller bad
    probability.

    Args:
        row: mapping-like object exposing 'INHERENT_RISK_SCORE' and 'year'.

    Returns:
        1 if a single np.random.rand() draw falls below the probability, else 0.
    """
    year_shift = 0.5 if row['year'] in [2020, 2021] else 0.0
    score = row['INHERENT_RISK_SCORE'] - year_shift
    bad_probability = 1 / (1 + np.exp(score + 2))
    return int(np.random.rand() < bad_probability)

# Vectorised form of df.apply(calculate_bad_flag, axis=1): the per-row Python call
# (one per panel row) is replaced by array arithmetic using the same formula.
# np.random.rand(len(df)) consumes one uniform per row, in row order, as the
# row-wise calls did.
year_shift = np.where(df['year'].isin([2020, 2021]), 0.5, 0.0)
bad_probability = 1 / (1 + np.exp(df['INHERENT_RISK_SCORE'].to_numpy() - year_shift + 2))
df['BAD'] = (np.random.rand(len(df)) < bad_probability).astype(np.int64)

# 3.2 Pick the observation date: BAD rows early in the year, GOOD rows late in the year
def generate_priority_date(row):
    """Return a random observation date inside the window implied by the BAD flag.

    BAD == 1 rows get a date from 1 January to 30 June of the row's year;
    all other rows get a date from 1 October to 31 December.

    Args:
        row: mapping-like object exposing 'year' and 'BAD'.

    Returns:
        A datetime at midnight, drawn uniformly over the days of the window
        (both end points included) using the stdlib `random` module.
    """
    year = row['year']
    if row['BAD'] == 1:
        start, end = datetime(year, 1, 1), datetime(year, 6, 30)
    else:
        start, end = datetime(year, 10, 1), datetime(year, 12, 31)
    # randrange excludes its upper bound, so add 1 to make `end` itself reachable;
    # without it 30 June / 31 December could never be drawn.
    return start + timedelta(days=random.randrange((end - start).days + 1))

# One Python-level call per panel row. generate_priority_date reads the row's 'BAD'
# and 'year' and draws its day offset from the stdlib `random` module (not np.random),
# so this column is reproducible only when `random` itself has been seeded.
df['datadate'] = df.apply(generate_priority_date, axis=1)

# ==========================================
# 4. SINH BIẾN TÀI CHÍNH CHI TIẾT
# ==========================================
print("4. Sinh biến tài chính chi tiết (Rolling 3M-12M)...")
N = len(df)
risk_factor = df['INHERENT_RISK_SCORE']

# --- 4.1 Outstanding balance (CBAL) & credit limits ---
# Monetary columns are cast with an explicit np.int64 (as BASE_AUM already is).
# A bare `int` maps to a 32-bit integer on some platforms (Windows with NumPy 1.x),
# and these log-normal amounts go beyond the 32-bit range.
base_loan = np.random.lognormal(18, 1, N)
df['CBAL'] = (base_loan * (1 - risk_factor * 0.2)).astype(np.int64).clip(0)

# Zero-balance group: roughly 20% of GOOD rows get CBAL = 0.
# BAD rows are excluded from the mask, so their balance is left untouched here.
mask_zero_debt = (df['BAD'] == 0) & (np.random.rand(N) < 0.2)
df.loc[mask_zero_debt, 'CBAL'] = 0

# Original balance (CBALORG):
#   CBAL > 0  -> current balance scaled up by a factor in [1.0, 1.5)
#   CBAL == 0 -> an independent log-normal draw (a historical balance still exists)
df['CBALORG'] = np.where(
    df['CBAL'] > 0,
    (df['CBAL'] * np.random.uniform(1.0, 1.5, N)).astype(np.int64),
    np.random.lognormal(18, 1, N).astype(np.int64)
)

df['CBAL_AVG'] = (df['CBAL'] * np.random.uniform(0.8, 1.2, N)).astype(np.int64)
df['CBAL_MAX'] = (df['CBAL_AVG'] * 1.3).astype(np.int64)
df['CBAL_MIN'] = (df['CBAL_AVG'] * 0.7).astype(np.int64)

# Credit limit (AFLIMT) is derived from CBALORG, so it is populated even when CBAL is 0
df['AFLIMT_MAX'] = (df['CBALORG'] * np.random.uniform(1.1, 2.5, N)).astype(np.int64)
df['AFLIMT_MIN'] = (df['AFLIMT_MAX'] * 0.9).astype(np.int64)
df['AFLIMT_AVG'] = (df['AFLIMT_MAX'] + df['AFLIMT_MIN']) // 2
df['AFLIMT_SUM'] = df['AFLIMT_MAX']

# --- 4.2 Loan term & pricing ---
loan_term_months = np.random.choice([12, 24, 36, 60], N)
df['DURATION_MAX'] = loan_term_months
df['DURATION_AVG'] = df['DURATION_MAX']
# Remaining term is 10%-90% of the full term; TIME_TO_OP_MAX is the term minus the remaining term.
remaining_share = np.random.uniform(0.1, 0.9, N)
df['REMAINING_DURATION_MAX'] = (df['DURATION_MAX'] * remaining_share).astype(int)
df['TIME_TO_OP_MAX'] = df['DURATION_MAX'] - df['REMAINING_DURATION_MAX']
# Rate ~ Normal(9.5, 2.0), bounded to [5, 20]
df['RATE_AVG'] = np.clip(np.random.normal(9.5, 2.0, N), 5, 20)

# --- 4.3 Income & LTV ---
df['INCOME'] = np.random.lognormal(16.5, 0.8, N).astype(int)
# LTV here is CBAL as a percentage of (50 x income, with +/-20% noise), bounded to [0, 150].
ltv_noise = np.random.uniform(0.8, 1.2, N)
df['LTV'] = ((df['CBAL'] / (df['INCOME'] * 50 * ltv_noise)) * 100).clip(0, 150)

# --- 4.4 Deposits ---
# Explicit np.int64 casts: log-normal(16, 2) deposits run well past the 32-bit
# integer range, and a bare `int` is 32-bit on some platforms (Windows, NumPy 1.x).
base_deposit = np.random.lognormal(16, 2, N)
df['N_AVG_DEPOSIT_12M'] = (base_deposit * (1 + risk_factor * 0.5)).astype(np.int64).clip(0)

# Force the BAD profile: BAD rows get a deposit drawn uniformly below 5 million
df['N_AVG_DEPOSIT_12M'] = np.where(df['BAD'] == 1,
                                   np.random.uniform(0, 5000000, N).astype(np.int64),
                                   df['N_AVG_DEPOSIT_12M'])

# Rolling windows (3M, 6M, 9M): the 12M average with progressively tighter noise
df['N_AVG_DEPOSIT_3M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.8, 1.2, N)).astype(np.int64)
df['N_AVG_DEPOSIT_6M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.9, 1.1, N)).astype(np.int64)
df['N_AVG_DEPOSIT_9M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.95, 1.05, N)).astype(np.int64)

df['N_AVG_DD_12M'] = (df['N_AVG_DEPOSIT_12M'] * 0.3).astype(np.int64)  # CASA: fixed 30% share
df['N_AVG_CD_12M'] = (df['N_AVG_DEPOSIT_12M'] * 0.7).astype(np.int64)  # savings: fixed 70% share
df['FLAG_DEPOSIT'] = np.where(df['N_AVG_DEPOSIT_12M'] > 500000, 1, 0)
df['FLAG_SALARY_ACC'] = np.random.choice([0, 1], N)

# ==========================================
# 5. BIẾN HÀNH VI & NỢ XẤU CHI TIẾT
# ==========================================
print("5. Mapping chi tiết nợ quá hạn (Overdue Details)...")

def assign_bad_behavior(row):
    """Draw a (days-past-due, debt-group) pair consistent with the row's BAD flag.

    Rows with BAD != 1 get a DPD in [0, 30) and a group of 1 (p=0.95) or 2 (p=0.05).
    Rows with BAD == 1 follow one of two scenarios:
      * 'DPD_HIGH' (p=0.7): DPD in [91, 360) with group 1;
      * 'CIC_BAD'  (p=0.3): DPD in [0, 89) with group 3, 4 or 5.

    Args:
        row: mapping-like object exposing 'BAD'.

    Returns:
        Tuple (dpd, group); upper bounds of the DPD ranges are exclusive.
    """
    if row['BAD'] != 1:
        return np.random.randint(0, 30), np.random.choice([1, 1, 2], p=[0.9, 0.05, 0.05])

    scenario = np.random.choice(['DPD_HIGH', 'CIC_BAD'], p=[0.7, 0.3])
    if scenario == 'DPD_HIGH':
        return np.random.randint(91, 360), 1
    return np.random.randint(0, 89), np.random.choice([3, 4, 5])

# Row-wise draw of (DPD, debt group); the expanded result has integer column labels 0 and 1.
behavior = df.apply(assign_bad_behavior, axis=1, result_type='expand')
df['MAX_DPD_12M_FWD'] = behavior[0]
df['MAX_NHOMNOCIC'] = behavior[1]

df['SUM_ALL_OD_12M'] = df['MAX_DPD_12M_FWD'] * np.random.randint(1, 3, N)
df['MAX_AFCPNO_OD_12M'] = df['MAX_DPD_12M_FWD']
df['MAX_NHOMNOCUOI'] = df['MAX_NHOMNOCIC']
# NOTE(review): assign_bad_behavior draws DPD with exclusive upper bounds of at most 360,
# so MAX_DPD_12M_FWD never exceeds 359 and this flag is 0 on every row as written.
# Confirm the intended threshold before relying on XULYNO.
df['XULYNO'] = np.where(df['MAX_DPD_12M_FWD'] > 360, 1, 0)

# Overdue balance: 10%-100% of CBAL when any DPD exists, otherwise 0.
# Explicit np.int64: balances exceed the 32-bit range and a bare `int` is 32-bit on some platforms.
df['N_AVG_OVERDUE_CBAL_12M'] = np.where(df['MAX_DPD_12M_FWD'] > 0, df['CBAL'] * np.random.uniform(0.1, 1.0, N), 0).astype(np.int64)
df['N_MAX_OVERDUE_CBAL_12M'] = df['N_AVG_OVERDUE_CBAL_12M']

# ==========================================
# 6. BIẾN VĨ MÔ (MACRO)
# ==========================================
print("6. Ghép nối dữ liệu vĩ mô (Macroeconomics)...")
# Yearly macro indicators, one row per year: (year, GDP, CPI, UR, IIP).
_MACRO_FIELDS = ('GDP', 'CPI', 'UR', 'IIP')
_MACRO_ROWS = [
    (2018, 7.08, 3.54, 2.19, 10),
    (2019, 7.02, 2.79, 2.17, 9),
    (2020, 2.91, 3.23, 2.48, 3),
    (2021, 2.58, 1.84, 3.22, 4),
    (2022, 8.02, 3.15, 2.32, 8),
    (2023, 5.05, 3.25, 2.28, 5),
]
macro_data = {
    row_year: dict(zip(_MACRO_FIELDS, row_values))
    for row_year, *row_values in _MACRO_ROWS
}


def get_macro(year, ind):
    """Look up indicator `ind` for `year` in macro_data.

    Returns 0 when either the year or the indicator is not present.
    """
    try:
        return macro_data[year][ind]
    except KeyError:
        return 0

# Attach each indicator by year. The per-year values are resolved once through
# get_macro (so years missing from macro_data still map to 0) and then mapped
# onto the 'year' column. The *_GROWTH_12M columns are the level multiplied by
# uniform noise in [0.9, 1.1).
macro_indicators = ['GDP', 'CPI', 'UR', 'IIP']
years_present = df['year'].unique()
for ind in macro_indicators:
    col = 'REAL_GDP' if ind == 'GDP' else ind
    value_by_year = {panel_year: get_macro(panel_year, ind) for panel_year in years_present}
    df[col] = df['year'].map(value_by_year)
    df[col + '_GROWTH_12M'] = df[col] * np.random.uniform(0.9, 1.1, N)

# ==========================================
# 7. SINH CHI TIẾT CẤU TRÚC NỢ (ĐÃ CLEAN CODE)
# ==========================================
print("7. Đang sinh chi tiết cấu trúc nợ & Tỷ lệ tài chính...")

# --- 7.1 Term structure ---
# Three uniform weights per row, normalised to sum to 1.
w_short, w_mid, w_long = (np.random.uniform(0, 1, N) for _ in range(3))
total_w = w_short + w_mid + w_long
w_short, w_mid, w_long = w_short / total_w, w_mid / total_w, w_long / total_w

# Short and mid buckets are truncated shares of CBAL; the long bucket is the remainder,
# so the three always add up to CBAL exactly.
# Explicit np.int64: balances exceed the 32-bit range and a bare `int` is 32-bit on some platforms.
df['CBAL_SHORTTERM_LOAN'] = (df['CBAL'] * w_short).astype(np.int64)
df['CBAL_MIDTERM_LOAN'] = (df['CBAL'] * w_mid).astype(np.int64)
df['CBAL_LONGTERM_LOAN'] = df['CBAL'] - df['CBAL_SHORTTERM_LOAN'] - df['CBAL_MIDTERM_LOAN']

# Consistency guard: rows with no balance carry no bucket balances
no_balance = df['CBAL'] == 0
df.loc[no_balance, ['CBAL_SHORTTERM_LOAN', 'CBAL_MIDTERM_LOAN', 'CBAL_LONGTERM_LOAN']] = 0
for bucket in ('SHORTTERM', 'MIDTERM', 'LONGTERM'):
    df[f'HAS_{bucket}_LOAN'] = np.where(df[f'CBAL_{bucket}_LOAN'] > 0, 1, 0)

# Rows with no balance carry no overdue balance either
df.loc[no_balance, ['N_AVG_OVERDUE_CBAL_12M', 'N_MAX_OVERDUE_CBAL_12M']] = 0

# --- 7.2 LTV variation ---
# Monthly max sits 0-10% above LTV, monthly min 0-20% below; the average is their midpoint.
ltv_uplift = np.random.uniform(1.0, 1.1, N)
ltv_discount = np.random.uniform(0.8, 1.0, N)
df['MAX_LTV_MO'] = df['LTV'] * ltv_uplift
df['MIN_LTV_MO'] = df['LTV'] * ltv_discount
df['AVG_LTV_MO'] = (df['MAX_LTV_MO'] + df['MIN_LTV_MO']) / 2

# --- 7.3 Balance-to-income ratio ---
# The +1 keeps the denominator non-zero. Shorter windows are the 12-month ratio
# with progressively wider uniform noise.
df['CBAL_TO_INC_12MON'] = df['CBAL'] / (df['INCOME'] + 1)
for window_months, noise_low, noise_high in ((9, 0.95, 1.05), (6, 0.9, 1.1), (3, 0.85, 1.15)):
    df[f'CBAL_TO_INC_{window_months}MON'] = (
        df['CBAL_TO_INC_12MON'] * np.random.uniform(noise_low, noise_high, N)
    )

# --- 7.4 Repayment information ---
# Interest is computed on CBAL_AVG rather than on the point-in-time CBAL.
# Explicit np.int64: these amounts exceed the 32-bit range and a bare `int` is 32-bit on some platforms.
df['INTEREST_12M'] = (df['CBAL_AVG'] * (df['RATE_AVG'] / 100)).astype(np.int64)
df['INTEREST'] = (df['INTEREST_12M'] / 12).astype(np.int64)

# Payment counts (1-12) only for rows that carry a balance
df['N_PAYMENT_GOC'] = np.where(df['CBAL'] > 0, np.random.randint(1, 13, N), 0)
df['N_PAYMENT_LAI'] = np.where(df['CBAL'] > 0, np.random.randint(1, 13, N), 0)

df['PRINPICAL_PYMT_FRQ_ID_MAX'] = np.random.choice([1, 1, 1, 3, 6], N, p=[0.7, 0.1, 0.1, 0.05, 0.05])
df['INT_PYMT_FRQ_ID_MAX'] = df['PRINPICAL_PYMT_FRQ_ID_MAX']

# --- 7.5 Codes & dirty-data preparation ---
df['PURCOD_MAX'] = np.random.randint(1, 10, N)
df['PURCOD_MIN'] = df['PURCOD_MAX']
# The levels must be an object array: a plain list mixing strings with np.nan is
# coerced by NumPy to a string array, turning the missing marker into the literal
# text 'nan', which isna() does not detect. With dtype=object, 20% of rows get a real NaN.
cforgd_levels = np.array(['C', 'D', 'O', np.nan], dtype=object)
df['CFORGD'] = np.random.choice(cforgd_levels, N, p=[0.4, 0.3, 0.1, 0.2])

# --- 7.6 CBALORG variation ---
df['CBALORG_MAX'] = df['CBALORG']
df['CBALORG_MIN'] = (df['CBALORG'] * 0.9).astype(np.int64)
df['CBALORG_AVG'] = (df['CBALORG_MAX'] + df['CBALORG_MIN']) // 2

# ==========================================
# 8. GÁN NHÃN & TẠO DIRTY DATA
# ==========================================
def assign_sample(year):
    """Return the sample label for a row of the given year.

    2022 is always 'OOT' and 2023 always 'VALIDATION' (no random draw is made
    for them). Every other year is split at random: 'TRAIN' with probability
    0.8, otherwise 'TEST', using one np.random.rand() draw per call.
    """
    for fixed_year, label in ((2022, 'OOT'), (2023, 'VALIDATION')):
        if year == fixed_year:
            return label
    return 'TEST' if np.random.rand() >= 0.8 else 'TRAIN'
df['SAMPLE_TYPE'] = df['year'].map(assign_sample)

# --- INJECT DIRTY DATA (material for the data-cleaning step) ---
# Missing values are already planted: CFORGD was generated with np.nan as one of
# its choices at probability 0.2.
# The remaining defects overwrite randomly chosen rows. Index labels are sampled
# with replacement, so the number of distinct rows hit can be below the count listed.
dirty_specs = [
    ('TUOI', 50, 14),            # age below 15
    ('LTV', 30, 500),            # implausible LTV (above 400%)
    ('DURATION_MAX', 20, -12),   # negative loan term (simulated system error)
]
for dirty_col, n_rows, dirty_value in dirty_specs:
    df.loc[np.random.choice(df.index, n_rows), dirty_col] = dirty_value

# Summary of the generated panel
zero_balance_count = (df['CBAL'] == 0).sum()
print("-" * 30)
print(f"XONG! Kích thước: {df.shape}")
print(f"Số cột: {len(df.columns)}")
print("Phân phối Sample:")
print(df['SAMPLE_TYPE'].value_counts())
print(f"\nSố lượng KH Zero Balance (Kiểm tra logic): {zero_balance_count}")

# df.to_csv("BIDV_Scorecard_Final.csv", index=False)

1. Đang tạo hồ sơ tĩnh (Static Profile)...
2. Expand dữ liệu theo năm...
3. Tính toán BAD & Chọn ngày quan sát ưu tiên...
4. Sinh biến tài chính chi tiết (Rolling 3M-12M)...
5. Mapping chi tiết nợ quá hạn (Overdue Details)...
6. Ghép nối dữ liệu vĩ mô (Macroeconomics)...
7. Đang sinh chi tiết cấu trúc nợ & Tỷ lệ tài chính...
------------------------------
XONG! Kích thước: (10200000, 80)
Số cột: 80
Phân phối Sample:
SAMPLE_TYPE
TRAIN         5442449
OOT           1700000
VALIDATION    1700000
TEST          1357551
Name: count, dtype: int64

Số lượng KH Zero Balance (Kiểm tra logic): 1681056


In [63]:
pd.set_option('display.max_columns', None)

In [64]:
df

Unnamed: 0,SOCIF,C_GIOITINH,BASE_AGE,TRINHDO,TTHONNHAN,SOHUUNHA,NHANVIENBIDV,PDKH,final_CST_MKT_SEG,INHERENT_RISK_SCORE,year,TUOI,BAD,datadate,CBAL,CBALORG,CBAL_AVG,CBAL_MAX,CBAL_MIN,AFLIMT_MAX,AFLIMT_MIN,AFLIMT_AVG,AFLIMT_SUM,DURATION_MAX,DURATION_AVG,REMAINING_DURATION_MAX,TIME_TO_OP_MAX,RATE_AVG,INCOME,LTV,N_AVG_DEPOSIT_12M,N_AVG_DEPOSIT_3M,N_AVG_DEPOSIT_6M,N_AVG_DEPOSIT_9M,N_AVG_DD_12M,N_AVG_CD_12M,FLAG_DEPOSIT,FLAG_SALARY_ACC,MAX_DPD_12M_FWD,MAX_NHOMNOCIC,SUM_ALL_OD_12M,MAX_AFCPNO_OD_12M,MAX_NHOMNOCUOI,XULYNO,N_AVG_OVERDUE_CBAL_12M,N_MAX_OVERDUE_CBAL_12M,REAL_GDP,REAL_GDP_GROWTH_12M,CPI,CPI_GROWTH_12M,UR,UR_GROWTH_12M,IIP,IIP_GROWTH_12M,CBAL_SHORTTERM_LOAN,CBAL_MIDTERM_LOAN,CBAL_LONGTERM_LOAN,HAS_SHORTTERM_LOAN,HAS_MIDTERM_LOAN,HAS_LONGTERM_LOAN,MAX_LTV_MO,MIN_LTV_MO,AVG_LTV_MO,CBAL_TO_INC_12MON,CBAL_TO_INC_9MON,CBAL_TO_INC_6MON,CBAL_TO_INC_3MON,INTEREST_12M,INTEREST,N_PAYMENT_GOC,N_PAYMENT_LAI,PRINPICAL_PYMT_FRQ_ID_MAX,INT_PYMT_FRQ_ID_MAX,PURCOD_MAX,PURCOD_MIN,CFORGD,CBALORG_MAX,CBALORG_MIN,CBALORG_AVG,SAMPLE_TYPE
144553,1000000,M,31,3,Married,0,0,CN,Upper,0.081665,2018,31,0,2018-10-19,0,55219722,0,0,0,63145415,56830873,59988144,63145415,12,12,9,3,10.431875,7524906,0.000000,298291283,266899249,321985433,299834155,89487384,208803898,1,0,6,1,6,6,1,0,0,0,7.08,7.428960,3.54,3.329762,2.19,2.364794,10,10.959149,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,1,1,2,2,C,55219722,49697749,52458735,TRAIN
3262557,1000000,M,31,3,Married,0,0,CN,Upper,0.081665,2019,32,0,2019-10-02,0,15776359,0,0,0,37029672,33326704,35178188,37029672,24,24,7,17,9.780467,6544883,0.000000,40365699,35691757,41114917,41874922,12109709,28255989,1,1,17,1,17,17,1,0,0,0,7.02,7.119873,2.79,2.681025,2.17,2.305719,9,8.365479,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,1,1,6,6,D,15776359,14198723,14987541,TRAIN
4865346,1000000,M,31,3,Married,0,0,CN,Upper,0.081665,2020,33,0,2020-12-12,0,32555056,0,0,0,52946238,47651614,50298926,52946238,60,60,9,51,13.141884,9304234,0.000000,3716132,4382463,3738954,3726612,1114839,2601292,1,0,0,1,0,0,1,0,0,0,2.91,2.700748,3.23,3.532777,2.48,2.506978,3,3.204949,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,3,3,9,9,O,32555056,29299550,30927303,TEST
5798457,1000000,M,31,3,Married,0,0,CN,Upper,0.081665,2021,34,0,2021-10-31,37611465,49002406,40660674,52858876,28462471,62691375,56422237,59556806,62691375,12,12,5,7,11.886971,34059038,2.566387,457135981,459858063,475208856,469389332,137140794,319995186,1,0,14,1,14,14,1,0,18835324,18835324,2.58,2.366883,1.84,1.833995,3.22,3.082874,4,3.945025,12883581,12476682,12251202,1,1,1,2.626571,2.204192,2.415381,1.104302,1.109412,1.148937,1.070245,4833322,402776,6,4,1,1,2,2,C,49002406,44102165,46552285,TRAIN
7859992,1000000,M,31,3,Married,0,0,CN,Upper,0.081665,2022,35,0,2022-10-21,258073574,373398538,309336865,402137924,216535805,574998970,517499073,546249021,574998970,36,36,10,26,5.540016,17600743,25.932672,1036463,870526,1009827,1082301,310938,725524,1,1,20,1,20,20,1,0,170316877,170316877,8.02,7.282516,3.15,3.366488,2.32,2.541098,8,7.512059,54634428,55284338,148154808,1,1,1,27.196893,22.057096,24.626994,14.662651,14.336036,14.344643,13.183018,17137312,1428109,4,11,1,1,4,4,,373398538,336058684,354728611,OOT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1781386,2999999,F,38,3,Single,1,0,CN,Private,-1.253752,2019,39,0,2019-10-08,24580672,32589657,23124293,30061580,16187005,45751600,41176440,43464020,45751600,60,60,32,28,8.564505,7829932,7.319840,2769039,2575947,2630717,2655134,830711,1938327,1,1,28,1,56,28,1,0,18669501,18669501,7.02,6.643539,2.79,3.020081,2.17,2.118459,9,9.375530,14404712,1401515,8774445,1,1,1,7.745565,6.778544,7.262055,3.139321,3.014722,3.267232,3.356087,1980481,165040,5,5,1,1,2,2,,32589657,29330691,30960174,TRAIN
3748237,2999999,F,38,3,Single,1,0,CN,Private,-1.253752,2020,40,0,2020-10-09,0,18583637,0,0,0,41414289,37272860,39343574,41414289,24,24,11,13,13.068060,4579704,0.000000,1609305,1580081,1564826,1645948,482791,1126513,1,0,8,1,8,8,1,0,0,0,2.91,3.078375,3.23,2.929928,2.48,2.610878,3,3.014897,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,1,1,6,6,C,18583637,16725273,17654455,TRAIN
5943327,2999999,F,38,3,Single,1,0,CN,Private,-1.253752,2021,41,0,2021-11-24,147481413,204317229,136620772,177607003,95634540,457828239,412045415,434936827,457828239,12,12,1,11,8.980053,7746405,39.420161,1612211,1905591,1686931,1555198,483663,1128547,1,1,12,2,12,12,2,0,77586230,77586230,2.58,2.595747,1.84,2.017216,3.22,3.045694,4,4.349622,70322750,60847114,16311549,1,1,1,40.264077,34.974076,37.619076,19.038689,18.725013,19.563757,19.675374,12268617,1022384,11,8,1,1,9,9,O,204317229,183885506,194101367,TRAIN
8115486,2999999,F,38,3,Single,1,0,CN,Private,-1.253752,2022,42,0,2022-10-03,0,65821593,0,0,0,132038643,118834778,125436710,132038643,24,24,8,16,9.632058,43477223,0.000000,688966,715662,627531,663630,206689,482276,1,0,21,2,42,21,2,0,0,0,8.02,7.690660,3.15,3.447838,2.32,2.449861,8,8.621236,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,1,1,6,6,,65821593,59239433,62530513,OOT


In [65]:
df['INTEREST_12M'].min()

0

In [66]:
df['INTEREST_12M'].max()

1153223156

In [68]:
df.to_csv("BIDV_Scorecard_Final.csv", index=False)

In [69]:
# Columns that carry the outcome itself or information from the forward-looking window
leak_cols = [
    'BAD',               # target variable (separated out as y)
    'MAX_DPD_12M_FWD',   # future information
    'MAX_NHOMNOCIC',     # future information (as generated in this notebook)
    'XULYNO',            # future information
    'SUM_ALL_OD_12M',    # future information (as generated in this notebook)
    'MAX_AFCPNO_OD_12M', # future information (as generated in this notebook)
    'MAX_NHOMNOCUOI',    # future information (as generated in this notebook)
    'N_AVG_OVERDUE_CBAL_12M', # future information (as generated in this notebook)
    'N_MAX_OVERDUE_CBAL_12M'  # future information (as generated in this notebook)
]

# Simulation-only driver: BAD is sampled directly from INHERENT_RISK_SCORE in
# calculate_bad_flag, so leaving it among the features hands the model the latent
# variable behind the target.
latent_cols = ['INHERENT_RISK_SCORE']

# Identifier / bookkeeping columns that are not model inputs
# (datadate is additionally chosen from the BAD flag).
id_cols = ['SOCIF', 'datadate', 'SAMPLE_TYPE']

# Feature matrix (X)
X = df.drop(columns=leak_cols + latent_cols + id_cols)

# Target vector (y)
y = df['BAD']