In [13]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [14]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# ==========================================
# 1. CẤU HÌNH & KHỞI TẠO
# ==========================================
NUM_CUSTOMERS = 400000  # Số lượng khách hàng unique
YEARS = [2018, 2019, 2020, 2021, 2022, 2023]
np.random.seed(42)

print("1. Đang tạo hồ sơ tĩnh (Static Profile)...")
# --- 1.1 Tạo Tuổi & Trình độ ---
age_array = np.random.normal(35, 10, NUM_CUSTOMERS).astype(int)

cond_edu = [age_array < 22, age_array <= 25]
choice_edu = [
    np.random.choice([1, 2], NUM_CUSTOMERS, p=[0.9, 0.1]),       # < 22
    np.random.choice([1, 2, 3], NUM_CUSTOMERS, p=[0.3, 0.6, 0.1]) # 22-25
]
default_edu = np.random.choice([1, 2, 3, 4], NUM_CUSTOMERS, p=[0.3, 0.5, 0.15, 0.05])
edu_array = np.select(cond_edu, choice_edu, default=default_edu)

# --- 1.2 Tạo Tài sản gốc (Initial AUM) ---
base_aum_init = np.random.lognormal(17, 2.0, NUM_CUSTOMERS).astype(np.int64)

# --- [PATCH 4] Logic Sở hữu nhà theo Tuổi ---
# Tuổi càng cao -> Xác suất có nhà càng lớn
prob_house = (age_array - 18) / 50 
prob_house = np.clip(prob_house, 0.05, 0.95)
house_ownership = (np.random.rand(NUM_CUSTOMERS) < prob_house).astype(int)

# --- 1.3 Đóng gói DataFrame ---
customers = pd.DataFrame({
    'SOCIF': np.arange(1000000, 1000000 + NUM_CUSTOMERS),
    'C_GIOITINH': np.random.choice(['M', 'F', 'O'], NUM_CUSTOMERS, p=[0.49, 0.49, 0.02]),
    
    'BASE_AGE': age_array,
    'TRINHDO': edu_array,
    'INITIAL_AUM': base_aum_init,
    
    'TTHONNHAN': np.random.choice(['Single', 'Married'], NUM_CUSTOMERS),
    'SOHUUNHA': house_ownership, # <--- Đã dùng biến mới
    'NHANVIENBIDV': np.random.choice([0, 1], NUM_CUSTOMERS, p=[0.99, 0.01]),
    'INHERENT_RISK_SCORE': np.random.normal(0, 1, NUM_CUSTOMERS)
})

# ==========================================
# 2. MỞ RỘNG DỮ LIỆU THEO NĂM (RANDOM WALK)
# ==========================================
print("2. Expand dữ liệu theo năm (Có biến động tài sản)...")

# --- BƯỚC 2.1: TẠO MA TRẬN TĂNG TRƯỞNG ---
n_years = len(YEARS)
growth_matrix = np.random.normal(1.08, 0.15, (NUM_CUSTOMERS, n_years)) 
growth_matrix[:, 2] -= 0.15 # 2020
growth_matrix[:, 3] -= 0.05 # 2021
cum_growth_matrix = np.cumprod(growth_matrix, axis=1)

# --- BƯỚC 2.2: GÁN VÀO DATAFRAME ---
df_list = []
for i, year in enumerate(YEARS):
    temp_df = customers.copy()
    temp_df['year'] = year
    
    # Tính AUM động
    current_growth_factors = cum_growth_matrix[:, i]
    temp_df['BASE_AUM'] = (temp_df['INITIAL_AUM'] * current_growth_factors).astype(np.int64).clip(0)
    
    # Tính lại phân khúc
    cond_segment = [
        temp_df['BASE_AUM'] < 100_000_000,
        temp_df['BASE_AUM'] < 1_000_000_000
    ]
    choice_segment = ['Mass', 'Upper']
    temp_df['final_CST_MKT_SEG'] = np.select(cond_segment, choice_segment, default='Private')
    
    temp_df = temp_df.sample(frac=0.85, random_state=year)
    df_list.append(temp_df)

df = pd.concat(df_list, ignore_index=True)
df = df.sort_values(by=['SOCIF', 'year'])
df['TUOI'] = df['BASE_AGE'] + (df['year'] - 2018)
df.drop(columns=['INITIAL_AUM'], inplace=True)


# ==========================================
# 4. SINH BIẾN TÀI CHÍNH (RE-ORDERED & PATCHED)
# ==========================================
print("4. Sinh biến tài chính (Logic AUM -> INCOME -> CBAL)...")
N = len(df)
risk_factor = df['INHERENT_RISK_SCORE']

# --- [PATCH 2] BƯỚC 4.1: SINH INCOME TRƯỚC (Dựa trên AUM & Trình độ) ---
# Logic: Có tài sản thì mới sinh ra thu nhập
base_income_from_aum = np.log1p(df['BASE_AUM']) * 1_200_000 
edu_multiplier = df['TRINHDO'].map({1: 0.8, 2: 1.0, 3: 1.5, 4: 2.5})

df['INCOME'] = (base_income_from_aum * edu_multiplier * np.random.uniform(0.8, 1.5, N)).astype(int)
df['INCOME'] = df['INCOME'].clip(5_000_000, 500_000_000)

# --- BƯỚC 4.2: SINH DƯ NỢ (CBAL) DỰA TRÊN INCOME ---
# Logic: Nhu cầu vay = Khẩu vị rủi ro (Random) * Khả năng trả nợ (Income)
base_loan_propensity = np.random.lognormal(17.5, 0.8, N) 
income_scale = df['INCOME'] / df['INCOME'].median()

# Người risk cao vay nhiều hơn, người thu nhập cao vay nhiều hơn
risk_adj = np.clip(1 - risk_factor * 0.1, 0.5, 1.5)
df['CBAL'] = (base_loan_propensity * income_scale * risk_adj).astype(int)

# Zero Balance Logic
mask_zero_debt = (df['INCOME'] < df['INCOME'].median()) & (np.random.rand(N) < 0.25)
df.loc[mask_zero_debt, 'CBAL'] = 0


# CBALORG 
df['CBALORG'] = (df['CBAL'] * np.random.uniform(1.0, 1.2, N)).astype(int)

df.loc[df['CBAL'] == 0, 'CBALORG'] = (
    np.random.lognormal(13, 0.6, (df['CBAL'] == 0).sum()).astype(int)
)

# --- [PATCH 3] Logic Toán học Max/Min/Avg ---
df['CBAL_AVG'] = (df['CBAL'] * np.random.uniform(0.8, 1.2, N)).astype(int)

temp_max = (df['CBAL_AVG'] * np.random.uniform(1.1, 1.5, N)).astype(int)
# Fix lỗi Max < Current bằng np.maximum.reduce
df['CBAL_MAX'] = np.maximum.reduce([temp_max, df['CBAL'], df['CBAL_AVG']]) 

temp_min = (df['CBAL_AVG'] * np.random.uniform(0.5, 0.9, N)).astype(int)
# Fix lỗi Min > Current bằng np.minimum.reduce
df['CBAL_MIN'] = np.minimum.reduce([temp_min, df['CBAL'], df['CBAL_AVG']]) 

# Hạn mức
limit_basis = np.maximum(df['CBALORG'], df['INCOME'] * 5)
df['AFLIMT_MAX'] = (limit_basis * np.random.uniform(1.0, 2.0, N)).astype(int)
df['AFLIMT_MIN'] = (df['AFLIMT_MAX'] * 0.9).astype(int)
df['AFLIMT_AVG'] = (df['AFLIMT_MAX'] + df['AFLIMT_MIN']) // 2
df['AFLIMT_SUM'] = df['AFLIMT_MAX']

# --- BƯỚC 4.3: SINH CÁC BIẾN CÒN LẠI (LTV, DEPOSIT) ---
# LTV (Dùng Income và CBAL đã đồng bộ
collateral_value = (
    df['BASE_AUM'] * np.random.uniform(0.5, 1.5, N)
    + df['SOHUUNHA'] * np.random.uniform(1e9, 3e9, N)
)

df['COLLATERAL_VALUE'] = collateral_value.astype(int)

df['LTV'] = (df['CBAL'] / (df['COLLATERAL_VALUE'] + 1)) * 100
df['LTV'] = df['LTV'].clip(0, 150)


# [PATCH 1] Tiền gửi (DEPOSIT) tương quan với AUM
deposit_ratio = np.random.uniform(0.1, 0.9, N)
df['N_AVG_DEPOSIT_12M'] = (df['BASE_AUM'] * deposit_ratio).astype(int)

# Ép phân khúc Bad

df['N_AVG_DEPOSIT_3M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.8, 1.2, N)).astype(int)
df['N_AVG_DEPOSIT_6M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.9, 1.1, N)).astype(int)
df['N_AVG_DEPOSIT_9M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.95, 1.05, N)).astype(int)

df['N_AVG_DD_12M'] = (df['N_AVG_DEPOSIT_12M'] * 0.3).astype(int) 
df['N_AVG_CD_12M'] = (df['N_AVG_DEPOSIT_12M'] * 0.7).astype(int) 
df['FLAG_DEPOSIT'] = np.where(df['N_AVG_DEPOSIT_12M'] > 500000, 1, 0)
df['FLAG_SALARY_ACC'] = np.random.choice([0, 1], N)

# --- Các biến thời hạn & Lãi suất ---
df['DURATION_MAX'] = np.random.choice([12, 24, 36, 60], N)
df['DURATION_AVG'] = df['DURATION_MAX']
df['REMAINING_DURATION_MAX'] = (df['DURATION_MAX'] * np.random.uniform(0.1, 0.9, N)).astype(int)
df['TIME_TO_OP_MAX'] = df['DURATION_MAX'] - df['REMAINING_DURATION_MAX']
df['RATE_AVG'] = np.random.normal(9.5, 2.0, N).clip(5, 20)

# ==========================================
# 5. BIẾN HÀNH VI & NỢ XẤU CHI TIẾT
# ==========================================
print("5. Mapping chi tiết nợ quá hạn...")


# DPD base từ risk + macro
base_dpd = (
    np.maximum(0, np.random.normal(15, 20, N))
    + (df['INHERENT_RISK_SCORE'] > 1) * np.random.randint(30, 90, N)
    + (df['year'].isin([2020, 2021])) * np.random.randint(10, 40, N)
)

df['MAX_DPD_12M'] = base_dpd.clip(0, 360).astype(int)
df['MAX_DPD_12M_OBS'] = (
    df['MAX_DPD_12M'] * np.random.uniform(0.8, 1.0, N)
).astype(int)
def cic_from_dpd(dpd):
    if dpd >= 180: return np.random.choice([4, 5])
    if dpd >= 90:  return 3
    if dpd >= 30:  return 2
    return 1

df['MAX_NHOMNOCIC'] = df['MAX_DPD_12M'].apply(cic_from_dpd)
df['SUM_ALL_OD_12M'] = (
    df['MAX_DPD_12M_OBS'] * np.random.uniform(0.3, 0.7, N)
).astype(int)
df['MAX_AFCPNO_OD_12M'] = df['MAX_DPD_12M_OBS'] 
df['MAX_NHOMNOCUOI'] = df['MAX_NHOMNOCIC']
df['XULYNO'] = np.where(
    (df['MAX_DPD_12M_OBS'] >= 90) & (np.random.rand(N) < 0.2),
    1, 0
)


df['N_AVG_OVERDUE_CBAL_12M'] = np.where(df['MAX_DPD_12M_OBS'] > 0, df['CBAL'] * np.random.uniform(0.1, 1.0, N), 0).astype(int)
df['N_MAX_OVERDUE_CBAL_12M'] = df['N_AVG_OVERDUE_CBAL_12M']

df['BAD'] = np.where(df['MAX_DPD_12M_OBS'] >= 90, 1, 0)

# Penalize deposit sau khi đã có DPD (NO LEAK)
df.loc[df['MAX_DPD_12M_OBS'] >= 60, 'N_AVG_DEPOSIT_12M'] *= 0.5
df['FLAG_DEPOSIT'] = np.where(df['N_AVG_DEPOSIT_12M'] > 500_000, 1, 0)

# Re-calc các window khác cho consistency
df['N_AVG_DEPOSIT_3M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.8, 1.2, N)).astype(int)
df['N_AVG_DEPOSIT_6M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.9, 1.1, N)).astype(int)
df['N_AVG_DEPOSIT_9M'] = (df['N_AVG_DEPOSIT_12M'] * np.random.uniform(0.95, 1.05, N)).astype(int)
    
# ==========================================
# 6. BIẾN VĨ MÔ (MACRO)
# ==========================================
print("6. Ghép nối dữ liệu vĩ mô...")
macro_data = {
    2018: {'GDP': 7.08, 'CPI': 3.54, 'UR': 2.19, 'IIP': 10},
    2019: {'GDP': 7.02, 'CPI': 2.79, 'UR': 2.17, 'IIP': 9},
    2020: {'GDP': 2.91, 'CPI': 3.23, 'UR': 2.48, 'IIP': 3},
    2021: {'GDP': 2.58, 'CPI': 1.84, 'UR': 3.22, 'IIP': 4},
    2022: {'GDP': 8.02, 'CPI': 3.15, 'UR': 2.32, 'IIP': 8},
    2023: {'GDP': 5.05, 'CPI': 3.25, 'UR': 2.28, 'IIP': 5},
}
def get_macro(year, ind): return macro_data.get(year, {}).get(ind, 0)

for ind in ['GDP', 'CPI', 'UR', 'IIP']:
    col = f'REAL_{ind}' if ind == 'GDP' else ind
    df[col] = df['year'].apply(lambda y: get_macro(y, ind))
    df[f'{col}_GROWTH_12M'] = df[col] * np.random.uniform(0.9, 1.1, N)

# ==========================================
# 7. SINH CHI TIẾT CẤU TRÚC NỢ
# ==========================================
print("7. Đang sinh chi tiết cấu trúc nợ...")

# Cấu trúc kỳ hạn
w_short = np.random.uniform(0, 1, N)
w_mid = np.random.uniform(0, 1, N)
w_long = np.random.uniform(0, 1, N)
total_w = w_short + w_mid + w_long
w_short /= total_w
w_mid /= total_w
w_long /= total_w

df['CBAL_SHORTTERM_LOAN'] = (df['CBAL'] * w_short).astype(int)
df['CBAL_MIDTERM_LOAN'] = (df['CBAL'] * w_mid).astype(int)
df['CBAL_LONGTERM_LOAN'] = df['CBAL'] - df['CBAL_SHORTTERM_LOAN'] - df['CBAL_MIDTERM_LOAN']

df.loc[df['CBAL'] == 0, ['CBAL_SHORTTERM_LOAN', 'CBAL_MIDTERM_LOAN', 'CBAL_LONGTERM_LOAN']] = 0
df['HAS_SHORTTERM_LOAN'] = np.where(df['CBAL_SHORTTERM_LOAN'] > 0, 1, 0)
df['HAS_MIDTERM_LOAN'] = np.where(df['CBAL_MIDTERM_LOAN'] > 0, 1, 0)
df['HAS_LONGTERM_LOAN'] = np.where(df['CBAL_LONGTERM_LOAN'] > 0, 1, 0)

df.loc[df['CBAL'] == 0, ['N_AVG_OVERDUE_CBAL_12M', 'N_MAX_OVERDUE_CBAL_12M']] = 0

# Biến động LTV
df['MAX_LTV_MO'] = df['LTV'] * np.random.uniform(1.0, 1.1, N)
df['MIN_LTV_MO'] = df['LTV'] * np.random.uniform(0.8, 1.0, N)
df['AVG_LTV_MO'] = (df['MAX_LTV_MO'] + df['MIN_LTV_MO']) / 2

# Tỷ lệ DTI
df['CBAL_TO_INC_12MON'] = df['CBAL'] / (df['INCOME'] + 1)
df['CBAL_TO_INC_12MON'] = df['CBAL_TO_INC_12MON'].clip(0, 5)
df['CBAL_TO_INC_9MON'] = df['CBAL_TO_INC_12MON'] * np.random.uniform(0.95, 1.05, N)
df['CBAL_TO_INC_6MON'] = df['CBAL_TO_INC_12MON'] * np.random.uniform(0.9, 1.1, N)
df['CBAL_TO_INC_3MON'] = df['CBAL_TO_INC_12MON'] * np.random.uniform(0.85, 1.15, N)

# Thông tin trả nợ (Dùng CBAL_AVG để tính lãi)
df['INTEREST_12M'] = (df['CBAL_AVG'] * (df['RATE_AVG'] / 100)).astype(int)
df['INTEREST'] = (df['INTEREST_12M'] / 12).astype(int)

df['N_PAYMENT_GOC'] = np.where(df['CBAL'] > 0, np.random.randint(1, 13, N), 0)
df['N_PAYMENT_LAI'] = np.where(df['CBAL'] > 0, np.random.randint(1, 13, N), 0)

df['PRINPICAL_PYMT_FRQ_ID_MAX'] = np.random.choice([1, 1, 1, 3, 6], N, p=[0.7, 0.1, 0.1, 0.05, 0.05])
df['INT_PYMT_FRQ_ID_MAX'] = df['PRINPICAL_PYMT_FRQ_ID_MAX']

# Codes
df['PURCOD_MAX'] = np.random.randint(1, 10, N)
df['PURCOD_MIN'] = df['PURCOD_MAX']
df['CFORGD'] = np.random.choice(['C', 'D', 'O', np.nan], N, p=[0.4, 0.3, 0.1, 0.2]) 

# CBALORG Variation
df['CBALORG_MAX'] = df['CBALORG']
df['CBALORG_MIN'] = (df['CBALORG'] * 0.9).astype(int)
df['CBALORG_AVG'] = (df['CBALORG_MAX'] + df['CBALORG_MIN']) // 2

# ==========================================
# 8. GÁN NHÃN & TẠO DIRTY 
# ==========================================
def assign_sample(year):
    if year <= 2020: return 'TRAIN'
    if year == 2021: return 'OOS'
    if year >= 2022: return 'OOT'

df['SAMPLE_TYPE'] = df['year'].apply(assign_sample)

# Inject Dirty Data
df.loc[np.random.choice(df.index, 50), 'TUOI'] = 14 
df.loc[np.random.choice(df.index, 30), 'LTV'] = 500
df.loc[np.random.choice(df.index, 20), 'DURATION_MAX'] = -12

print("-" * 30)
print(f"XONG! Kích thước: {df.shape}")
print(f"Số cột: {len(df.columns)}")
print("Phân phối Sample:")
print(df['SAMPLE_TYPE'].value_counts())
print(f"\nSố lượng KH Zero Balance: {(df['CBAL'] == 0).sum()}")

1. Đang tạo hồ sơ tĩnh (Static Profile)...
2. Expand dữ liệu theo năm (Có biến động tài sản)...
4. Sinh biến tài chính (Logic AUM -> INCOME -> CBAL)...
5. Mapping chi tiết nợ quá hạn...


 1.01728720e+09 1.85580055e+07]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[df['MAX_DPD_12M_OBS'] >= 60, 'N_AVG_DEPOSIT_12M'] *= 0.5


6. Ghép nối dữ liệu vĩ mô...
7. Đang sinh chi tiết cấu trúc nợ...
------------------------------
XONG! Kích thước: (2040000, 81)
Số cột: 81
Phân phối Sample:
SAMPLE_TYPE
TRAIN    1020000
OOT       680000
OOS       340000
Name: count, dtype: int64

Số lượng KH Zero Balance: 255227


In [15]:
pd.set_option('display.max_columns', None)

In [16]:
df

Unnamed: 0,SOCIF,C_GIOITINH,BASE_AGE,TRINHDO,TTHONNHAN,SOHUUNHA,NHANVIENBIDV,INHERENT_RISK_SCORE,year,BASE_AUM,final_CST_MKT_SEG,TUOI,INCOME,CBAL,CBALORG,CBAL_AVG,CBAL_MAX,CBAL_MIN,AFLIMT_MAX,AFLIMT_MIN,AFLIMT_AVG,AFLIMT_SUM,COLLATERAL_VALUE,LTV,N_AVG_DEPOSIT_12M,N_AVG_DEPOSIT_3M,N_AVG_DEPOSIT_6M,N_AVG_DEPOSIT_9M,N_AVG_DD_12M,N_AVG_CD_12M,FLAG_DEPOSIT,FLAG_SALARY_ACC,DURATION_MAX,DURATION_AVG,REMAINING_DURATION_MAX,TIME_TO_OP_MAX,RATE_AVG,MAX_DPD_12M,MAX_DPD_12M_OBS,MAX_NHOMNOCIC,SUM_ALL_OD_12M,MAX_AFCPNO_OD_12M,MAX_NHOMNOCUOI,XULYNO,N_AVG_OVERDUE_CBAL_12M,N_MAX_OVERDUE_CBAL_12M,BAD,REAL_GDP,REAL_GDP_GROWTH_12M,CPI,CPI_GROWTH_12M,UR,UR_GROWTH_12M,IIP,IIP_GROWTH_12M,CBAL_SHORTTERM_LOAN,CBAL_MIDTERM_LOAN,CBAL_LONGTERM_LOAN,HAS_SHORTTERM_LOAN,HAS_MIDTERM_LOAN,HAS_LONGTERM_LOAN,MAX_LTV_MO,MIN_LTV_MO,AVG_LTV_MO,CBAL_TO_INC_12MON,CBAL_TO_INC_9MON,CBAL_TO_INC_6MON,CBAL_TO_INC_3MON,INTEREST_12M,INTEREST,N_PAYMENT_GOC,N_PAYMENT_LAI,PRINPICAL_PYMT_FRQ_ID_MAX,INT_PYMT_FRQ_ID_MAX,PURCOD_MAX,PURCOD_MIN,CFORGD,CBALORG_MAX,CBALORG_MIN,CBALORG_AVG,SAMPLE_TYPE
154440,1000000,F,39,1,Single,1,0,0.242454,2018,1564967790,Private,39,30312543,40433626,47722500,43345909,60330953,29627058,237741934,213967740,225854837,237741934,3791261063,1.066495,1.010062e+09,1172188705,1028883572,1001205957,303018460,707043073,1,1,36,36,14,22,10.649865,14,12,1,5,12,1,0,24980608,24980608,0,7.08,6.607784,3.54,3.329296,2.19,2.095599,10,9.220424,15889130,11497355,13047141,1,1,1,1.148418,1.031856,1.090137,1.333891,1.382113,1.298909,1.167801,4616280,384690,8,4,1,1,3,3,C,47722500,42950250,45336375,TRAIN
518613,1000000,F,39,1,Single,1,0,0.242454,2019,1260465959,Private,40,16524852,15675810,17036446,17140585,19358518,13354580,105005053,94504547,99754800,105005053,3415822941,0.458918,6.415494e+08,652563349,593388246,654262550,192464835,449084615,1,1,36,36,25,11,8.042137,1,0,1,0,0,1,0,0,0,0,7.02,7.096477,2.79,3.022318,2.17,2.274983,9,9.407587,6550839,3430623,5694348,1,1,1,0.486653,0.379963,0.433308,0.948620,0.963487,0.919063,1.088176,1378469,114872,6,4,1,1,1,1,C,17036446,15332801,16184623,TRAIN
756324,1000000,F,39,1,Single,1,0,0.242454,2020,1231895728,Private,41,28548342,53003486,61960797,54222888,62515407,37666089,194257820,174832038,184544929,194257820,2225127842,2.382042,8.356894e+08,716022153,826304913,870002910,250706832,584982608,1,0,24,24,18,6,11.505939,27,24,1,16,24,1,0,43344659,43344659,0,2.91,2.689326,3.23,3.156089,2.48,2.307286,3,3.156250,25193091,13958345,13852050,1,1,1,2.488919,2.265843,2.377381,1.856622,1.841396,1.913204,1.775301,6238852,519904,9,11,1,1,2,2,C,61960797,55764717,58862757,TRAIN
1644495,1000000,F,39,1,Single,1,0,0.242454,2022,1329858197,Private,43,26549318,9238896,9915629,7677866,10448189,5241252,196905757,177215181,187060469,196905757,3327788152,0.277629,4.521403e+08,495291909,408566720,470818624,135642085,316498198,1,0,36,36,13,23,11.459239,15,12,1,7,12,1,0,5512384,5512384,0,8.02,7.981802,3.15,3.267831,2.32,2.313696,8,7.436385,3587501,3947985,1703410,1,1,1,0.298955,0.268200,0.283578,0.347990,0.331228,0.362221,0.329605,879824,73318,12,1,1,1,7,7,,9915629,8924066,9419847,OOT
1769095,1000000,F,39,1,Single,1,0,0.242454,2023,1094507856,Private,44,23531840,122642268,141819633,135984303,154862069,104379382,199993606,179994245,189993925,199993606,2557065714,4.796211,3.318030e+08,397619406,333464837,329767684,99540909,232262121,1,1,36,36,29,7,5.256469,22,18,1,12,18,1,0,99005333,99005333,0,5.05,5.087352,3.25,2.972741,2.28,2.252973,5,4.941112,73013257,18795738,30833273,1,1,1,4.925389,4.295702,4.610545,5.000000,5.124797,5.498154,5.045398,7147972,595664,12,11,1,1,8,8,C,141819633,127637669,134728651,OOT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856488,1399998,M,39,1,Married,1,0,0.426071,2023,1449727,Mass,44,17172758,9266496,10157962,8117309,9724963,6198401,168883237,151994913,160439075,168883237,1732730148,0.534792,1.274804e+06,1061162,1378010,1262451,382441,892362,1,1,12,12,10,2,8.424643,25,21,1,11,21,1,0,4996153,4996153,0,5.05,5.112037,3.25,3.106105,2.28,2.349513,5,4.702297,660027,4174892,4431577,1,1,1,0.587899,0.439615,0.513757,0.539604,0.519621,0.526245,0.507317,683854,56987,2,3,3,3,2,2,D,10157962,9142165,9650063,OOT
87830,1399999,M,33,4,Single,1,0,-0.262223,2018,33294262,Mass,33,69216862,524856830,559455650,627966105,706050455,524856830,1058495657,952646091,1005570874,1058495657,1617270897,32.453241,9.160478e+06,9692713,8598031,9002853,2748143,6412334,1,1,36,36,26,10,8.657423,0,0,1,0,0,1,0,0,0,0,7.08,7.557920,3.54,3.856980,2.19,2.158413,10,9.549167,164604471,267424083,92828276,1,1,1,35.642168,29.535462,32.588815,5.000000,5.147155,4.842481,4.367905,54365682,4530473,2,8,1,1,9,9,,559455650,503510085,531482867,TRAIN
516193,1399999,M,33,4,Single,1,0,-0.262223,2019,28992910,Mass,34,55149011,100689331,104407541,119544008,134373062,90055445,460821978,414739780,437780879,460821978,1506908504,6.681848,9.812312e+06,9705677,9502377,10009292,2943693,6868618,1,0,36,36,27,9,7.637167,30,27,2,16,27,2,0,61112266,61112266,0,7.02,6.981838,2.79,2.907303,2.17,2.183106,9,9.077034,42515521,21086544,37087266,1,1,1,6.839356,5.839870,6.339613,1.825769,1.775361,1.982572,1.594123,9129776,760814,8,4,1,1,8,8,C,104407541,93966786,99187163,TRAIN
757330,1399999,M,33,4,Single,1,0,-0.262223,2020,26120116,Mass,35,43705624,173586222,204610223,145245663,205885006,112957954,324381155,291943039,308162097,324381155,1616348266,10.739407,8.841456e+06,8053534,9284674,8817750,2652436,6189019,1,0,36,36,9,27,5.000000,36,30,2,13,30,2,0,156943496,156943496,0,2.91,2.756153,3.23,2.929189,2.48,2.351833,3,2.986342,97746864,62798775,13040583,1,1,1,11.199328,9.526112,10.362720,3.971714,3.899828,4.285419,4.110106,7262283,605190,9,3,6,6,9,9,D,204610223,184149200,194379711,TRAIN


In [17]:
df['INTEREST_12M'].max()

275223279

In [18]:
df.columns

Index(['SOCIF', 'C_GIOITINH', 'BASE_AGE', 'TRINHDO', 'TTHONNHAN', 'SOHUUNHA',
       'NHANVIENBIDV', 'INHERENT_RISK_SCORE', 'year', 'BASE_AUM',
       'final_CST_MKT_SEG', 'TUOI', 'INCOME', 'CBAL', 'CBALORG', 'CBAL_AVG',
       'CBAL_MAX', 'CBAL_MIN', 'AFLIMT_MAX', 'AFLIMT_MIN', 'AFLIMT_AVG',
       'AFLIMT_SUM', 'COLLATERAL_VALUE', 'LTV', 'N_AVG_DEPOSIT_12M',
       'N_AVG_DEPOSIT_3M', 'N_AVG_DEPOSIT_6M', 'N_AVG_DEPOSIT_9M',
       'N_AVG_DD_12M', 'N_AVG_CD_12M', 'FLAG_DEPOSIT', 'FLAG_SALARY_ACC',
       'DURATION_MAX', 'DURATION_AVG', 'REMAINING_DURATION_MAX',
       'TIME_TO_OP_MAX', 'RATE_AVG', 'MAX_DPD_12M', 'MAX_DPD_12M_OBS',
       'MAX_NHOMNOCIC', 'SUM_ALL_OD_12M', 'MAX_AFCPNO_OD_12M',
       'MAX_NHOMNOCUOI', 'XULYNO', 'N_AVG_OVERDUE_CBAL_12M',
       'N_MAX_OVERDUE_CBAL_12M', 'BAD', 'REAL_GDP', 'REAL_GDP_GROWTH_12M',
       'CPI', 'CPI_GROWTH_12M', 'UR', 'UR_GROWTH_12M', 'IIP', 'IIP_GROWTH_12M',
       'CBAL_SHORTTERM_LOAN', 'CBAL_MIDTERM_LOAN', 'CBAL_LONGTERM_LOAN',
       '

In [19]:
pd.set_option('display.max_columns', None)

In [41]:
df['INCOME'].min()

6371758

In [42]:
df['INCOME'].max()

107326986

In [40]:
df['CBAL'].value_counts()

CBAL
0           255227
37595521         4
35088836         3
17352325         3
56876361         3
             ...  
52983404         1
64495302         1
47786230         1
62096870         1
34240480         1
Name: count, Length: 1770268, dtype: int64