In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


### 1. 방문전 발생하는 데이터 feature selection(44개 + One hot encoding)


| Features                 |                            |                         |                              |
|--------------------------|----------------------------|-------------------------|------------------------------|
| term                     | emp_length                 | annual_inc              | dti                          |
| delinq_2yrs              | fico_avg                   | inq_last_6mths          | open_acc                     |
| pub_rec                  | revol_bal                  | revol_util              | total_acc                    |
| acc_now_delinq           | tot_cur_bal                | mths_since_rcnt_il      | total_bal_il                 |
| il_util                  | max_bal_bc                 | all_util                | total_rev_hi_lim             |
| total_cu_tl              | avg_cur_bal                | bc_open_to_buy          | bc_util                      |
| chargeoff_within_12_mths | mort_acc                   | num_accts_ever_120_pd   | num_actv_rev_tl              |
| num_bc_sats              | num_bc_tl                  | num_op_rev_tl           | num_rev_accts                |
| num_rev_tl_bal_gt_0      | num_sats                   | pct_tl_nvr_dlq          | percent_bc_gt_75             |
| pub_rec_bankruptcies     | tax_liens                  | tot_hi_cred_lim         | total_bal_ex_mort            |
| total_bc_limit           | total_il_high_credit_limit | home_ownership(One-hot) | verification_status(One-hot) |

### 해당 feature들로 loan_status, loan_status_prop 예측. loan_status_prop는 데이터에는 없는 변수로, 모델의 loan_status 분류 확률값을 의미

In [10]:
# Load the timeline CSV; the next cell reads its column names.
temp = pd.read_csv(filepath_or_buffer="./trd_timeline.csv")

In [41]:
# Column names of the timeline frame, followed by "issue_d".
train_col = [*temp.columns, "issue_d"]
train_col


['loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_length',
 'annual_inc',
 'loan_status',
 'dti',
 'delinq_2yrs',
 'fico_avg',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'total_rec_late_fee',
 'last_fico_range_high',
 'last_fico_range_low',
 'acc_now_delinq',
 'tot_cur_bal',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_fi',
 'total_cu_tl',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'mort_acc',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_rev_tl_bal_gt_0',
 'num_sats',
 'num_tl_120dpd_2m',
 'num_tl_30dpd',
 'num_tl_90g_dpd

In [50]:
# Load the raw Lending Club test file that the following cells preprocess.
df = pd.read_csv(filepath_or_buffer="./lending_club_2020_test.csv")


In [45]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,...,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag
0,3697367,20000.0,20000.0,20000.0,36 months,6.03%,608.72,A,A1,RANDALL J STOLL CPA PC,...,,,,,,,,,,N
1,154429912,14950.0,14950.0,14950.0,60 months,17.74%,377.53,C,C5,,...,,,,,,,,,,N
2,163922381,19125.0,19125.0,19125.0,36 months,8.81%,606.49,A,A5,Machine operator,...,,,,,,,,,,N
3,13046178,12000.0,12000.0,12000.0,60 months,14.64%,283.22,C,C3,Tech Locator,...,,,,,,,,,,N
4,64038304,16000.0,16000.0,16000.0,60 months,9.99%,339.88,B,B3,Managing Dentist,...,,,,,,,,,,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170193,88925017,6000.0,6000.0,6000.0,36 months,12.79%,201.56,C,C1,,...,,,,,,,,,,N
1170194,38449284,9600.0,9600.0,9600.0,36 months,9.49%,307.48,B,B2,Production Team Leader,...,,,,,,,,,,N
1170195,119255932,8400.0,8400.0,8400.0,36 months,24.85%,333.32,E,E3,Mannager,...,,,,,,,,,,N
1170196,128462821,5000.0,5000.0,5000.0,36 months,16.02%,175.84,C,C5,GROUND MAINTENANCE,...,,,,,,,,,,N


In [51]:
# loan_status: default flag and target variable (non-default = 0, default = 1).
# Keep only rows whose status is 'Fully Paid' or 'Charged Off'. The explicit
# .copy() detaches the filtered frame from the original so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()
df['loan_status'] = np.where(df['loan_status'] == 'Fully Paid', 0, 1)

# fico_avg: mean of fico_range_low and fico_range_high, inserted at the
# position of the fico_range_low column.
insert_loc = df.columns.get_loc('fico_range_low')
df.insert(insert_loc, 'fico_avg', (df['fico_range_low'] + df['fico_range_high']) / 2)

# emp_length: ordinal encoding 0..10 in ascending order of tenure.
# Missing values are mapped to -1 (the author treats NaN as "not employed");
# any value that is not a key of the mapping also ends up as -1.
label_mapping = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10
}

df['emp_length'] = df['emp_length'].map(label_mapping).fillna(-1)

# term: convert the categorical text to the number of months.
label_mapping = {
    ' 36 months': 36,
    ' 60 months': 60
}

# Strip whitespace on both the data and the mapping keys so the lookup works
# whether or not the raw value carries a leading space; otherwise a
# normalised source file would silently produce NaN for every row.
term_lookup = {key.strip(): months for key, months in label_mapping.items()}
df['term'] = df['term'].str.strip().map(term_lookup)

# grade, sub_grade: ordinal codes from a fixed vocabulary (A..G -> 0..6,
# A1..G5 -> 0..34). This matches what a LabelEncoder yields when every level
# is present, but does not depend on which levels occur in this file, so
# the codes stay aligned with other splits. Values outside the vocabulary
# become NaN.
grade_levels = list("ABCDEFG")
sub_grade_levels = [f"{letter}{step}" for letter in grade_levels for step in range(1, 6)]

df["grade"] = df["grade"].map({level: code for code, level in enumerate(grade_levels)})
df["sub_grade"] = df["sub_grade"].map({level: code for code, level in enumerate(sub_grade_levels)})

# int_rate: drop the trailing percent sign and parse as float.
df["int_rate"] = df["int_rate"].str.rstrip("%").astype(float)

# One-hot encode the nominal variables. Pinning the category lists makes the
# produced columns (and the category removed by drop_first) independent of the
# values present in this file: the baselines are always 'ANY' and
# 'Not Verified', and every other level gets a column even if unobserved.
# Values outside these lists are encoded as all-False.
df["home_ownership"] = pd.Categorical(
    df["home_ownership"],
    categories=["ANY", "MORTGAGE", "NONE", "OTHER", "OWN", "RENT"],
)
df["verification_status"] = pd.Categorical(
    df["verification_status"],
    categories=["Not Verified", "Source Verified", "Verified"],
)
df = pd.get_dummies(df, columns=["home_ownership", "verification_status"], drop_first=True)

In [52]:
# Keep only the columns named in train_col, in that order.
df = df.loc[:, train_col]
df

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,annual_inc,...,total_bc_limit,total_il_high_credit_limit,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,issue_d
0,20000.0,20000.0,20000.0,36,6.03,608.72,0,0,6.0,125000.0,...,49000.0,46184.0,True,False,False,False,False,True,False,Mar-2013
3,12000.0,12000.0,12000.0,60,14.64,283.22,2,12,0.0,60000.0,...,11400.0,5439.0,False,False,False,False,True,True,False,Mar-2014
6,15000.0,15000.0,15000.0,36,5.32,451.73,0,0,1.0,75000.0,...,17400.0,18876.0,True,False,False,False,False,False,False,Feb-2016
7,20000.0,20000.0,20000.0,36,13.05,674.37,1,9,1.0,87500.0,...,25900.0,56573.0,False,False,False,True,False,False,True,Aug-2013
10,11150.0,11150.0,11150.0,36,13.99,381.03,2,13,3.0,63498.0,...,27500.0,140853.0,False,False,False,False,True,True,False,Apr-2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170192,6000.0,6000.0,6000.0,36,6.99,185.24,0,1,4.0,62000.0,...,0.0,122354.0,False,False,False,False,True,True,False,Apr-2017
1170193,6000.0,6000.0,6000.0,36,12.79,201.56,2,10,-1.0,34000.0,...,9100.0,40364.0,True,False,False,False,False,False,True,Sep-2016
1170194,9600.0,9600.0,9600.0,36,9.49,307.48,1,6,10.0,60000.0,...,10000.0,2028.0,False,False,False,True,False,False,True,Jan-2015
1170195,8400.0,8400.0,8400.0,36,24.85,333.32,4,22,0.0,30000.0,...,8000.0,0.0,False,False,False,False,True,False,False,Oct-2017


In [53]:
# Per-column null counts, restricted to columns with at least one null.
missing_counts = df.isna().sum()
missing_counts = missing_counts.loc[missing_counts.gt(0)]

# Fraction of all rows that are null in each of those columns.
missing_ratios = missing_counts.div(len(df))

# Combine counts and ratios into one table, largest count first.
missing_df = pd.DataFrame(
    {'Missing Count': missing_counts, 'Missing Ratio': missing_ratios}
).sort_values(by='Missing Count', ascending=False)

missing_df

Unnamed: 0,Missing Count,Missing Ratio
mths_since_rcnt_il,350148,0.470349
all_util,338649,0.454903
open_acc_6m,338593,0.454827
total_cu_tl,338593,0.454827
inq_last_12m,338593,0.454827
open_act_il,338592,0.454826
open_il_12m,338592,0.454826
open_il_24m,338592,0.454826
open_rv_12m,338592,0.454826
open_rv_24m,338592,0.454826


In [54]:
# Write the preprocessed frame to disk without the index column.
df.to_csv(path_or_buf="./test_trd_timeline.csv", index=False)