In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the source CSV into a DataFrame and preview its first five rows.
file_path = 'sec_timeline_recovery.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(5)

Unnamed: 0,term,emp_length,annual_inc,dti,delinq_2yrs,fico_avg,inq_last_6mths,open_acc,pub_rec,revol_bal,...,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,loan_status,grade,sub_grade,loan_amnt,total_pymnt,installment,int_rate
0,36,2.0,45000.0,8.67,1.0,757.0,0.0,14.0,0.0,3090.0,...,False,False,True,0,0,4,6000.0,6718.84,187.94,7.97%
1,60,10.0,110000.0,34.7,1.0,672.0,1.0,24.0,0.0,16909.0,...,False,False,True,1,4,23,23200.0,8599.68,680.82,24.99%
2,36,-1.0,65000.0,17.74,0.0,822.0,0.0,18.0,0.0,3881.0,...,False,False,False,1,0,1,16000.0,11804.28,494.55,7.07%
3,36,5.0,50000.0,6.99,0.0,702.0,0.0,7.0,2.0,6824.0,...,False,False,False,0,1,7,4500.0,5134.174383,146.1,10.42%
4,36,10.0,60000.0,28.94,0.0,677.0,0.0,19.0,0.0,16626.0,...,False,True,False,0,1,7,20000.0,23278.037543,645.25,9.99%


In [3]:
# Report the dtype of every column: a header line followed by the dtype listing.
print("데이터 타입 정보:", df.dtypes, sep="\n")

# # Inspect the unique values of each column (disabled)
# for column in df.columns:
#     print(f"\n{column}의 고유값:")
#     print(df[column].unique())

데이터 타입 정보:
term                                     int64
emp_length                             float64
annual_inc                             float64
dti                                    float64
delinq_2yrs                            float64
fico_avg                               float64
inq_last_6mths                         float64
open_acc                               float64
pub_rec                                float64
revol_bal                              float64
revol_util                             float64
total_acc                              float64
acc_now_delinq                         float64
tot_cur_bal                            float64
total_rev_hi_lim                       float64
avg_cur_bal                            float64
bc_open_to_buy                         float64
bc_util                                float64
chargeoff_within_12_mths               float64
mort_acc                               float64
num_accts_ever_120_pd                  float64
nu

In [4]:
# Build the recovery-rate column.
# Formula: total amount paid / (term * installment), with any value above 1 capped at 1.
df['recovery_rate'] = (
    df['total_pymnt']
    .div(df['term'].mul(df['installment']))
    .clip(upper=1)
)

In [6]:
# Drop the listed columns from the DataFrame.
columns_to_drop = ['int_rate', 'total_pymnt', 'term', 'installment', 'loan_status', 'loan_amnt']
df = df.drop(columns_to_drop, axis=1)

In [7]:
# Feature scaling.
#
# Every scaled column receives exactly one of three treatments:
#   * min-max scaling onto [0, 1]
#   * log1p followed by min-max scaling
#   * binarisation (0 stays 0, any value > 0 becomes 1)
# All other columns (dummy variables, target, ...) are left unchanged.
#
# 'term' (36 / 60 months) was meant to be mapped to 0 / 1, but that step is
# disabled; it is kept here for reference only:
# df['term'] = df['term'].apply(lambda x: 0 if x == 36 else 1 if x == 60 else x)


def _min_max_scale(values: pd.Series) -> pd.Series:
    """Linearly rescale a numeric Series so its minimum maps to 0 and its maximum to 1.

    A constant Series has zero range, so the plain formula would divide 0 by 0
    and fill the column with NaN. In that case the non-missing values are
    mapped to 0.0 instead. Missing values stay missing.
    """
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Zero range: avoid 0/0 and return an all-zero float column.
        return (values - low).astype(float)
    return (values - low) / span


def _to_binary_flag(values: pd.Series) -> pd.Series:
    """Return 1 where the value is strictly greater than 0, else 0.

    The comparison is False for NaN, so missing values become 0.
    """
    return (values > 0).astype(int)


# Columns scaled with plain min-max (count-like / ordinal columns, plus the two
# balance/limit columns that deliberately skip the log transform).
MINMAX_COLUMNS = [
    'emp_length',
    'fico_avg',
    'open_acc',
    'total_acc',
    'total_rev_hi_lim',
    'avg_cur_bal',
    'mort_acc',
    'num_actv_rev_tl',
    'num_bc_sats',
    'num_bc_tl',
    'num_op_rev_tl',
    'num_rev_accts',
    'num_rev_tl_bal_gt_0',
    'num_sats',
]

# Columns transformed with log1p first and then min-max scaled.
LOG_MINMAX_COLUMNS = [
    'annual_inc',
    'dti',
    'revol_bal',
    'revol_util',
    'tot_cur_bal',
    'bc_open_to_buy',
    'bc_util',
    'pct_tl_nvr_dlq',
    'percent_bc_gt_75',
    'tot_hi_cred_lim',
    'total_bal_ex_mort',
    'total_bc_limit',
    'total_il_high_credit_limit',
]

# Columns collapsed to a 0/1 indicator (0 -> 0, anything greater than 0 -> 1).
BINARY_FLAG_COLUMNS = [
    'delinq_2yrs',
    'inq_last_6mths',
    'pub_rec',
    'acc_now_delinq',
    'chargeoff_within_12_mths',
    'num_accts_ever_120_pd',
    'pub_rec_bankruptcies',
    'tax_liens',
]

# The three groups are disjoint and each column is transformed independently,
# so the order of these loops does not affect the result.
for column in MINMAX_COLUMNS:
    df[column] = _min_max_scale(df[column])

for column in LOG_MINMAX_COLUMNS:
    df[column] = _min_max_scale(np.log1p(df[column]))

for column in BINARY_FLAG_COLUMNS:
    df[column] = _to_binary_flag(df[column])


In [8]:
# Save the processed DataFrame to CSV, omitting the row index.
processed_file_path = 'sec_timeline_recovery_mod.csv'
df.to_csv(path_or_buf=processed_file_path, index=False)

In [None]:
# NOTE(review): this unexecuted cell holds a bare comma-separated list of column names.
# As written it is a tuple expression over plain identifiers, so running it raises
# NameError unless variables with these names exist. Presumably a feature shortlist
# kept as a note — TODO confirm its purpose or move it into a markdown cell.
# 'int_rate' and 'loan_status' are listed here but are dropped from df earlier in the notebook.
emp_length, dti, revol_util, fico_avg, total_acc, total_bc_limit, int_rate, bc_open_to_buy, home_ownership_MORTGAGE, num_rev_tl_bal_gt_0, annual_inc, percent_bc_gt_75, pub_rec_bankruptcies, num_accts_ever_120_pd, num_sats, loan_status, acc_now_delinq, bc_util, total_rev_hi_lim, home_ownership_OWN