In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 01. train, test data 정제

## 01-1. 데이터 불러오기

In [2]:
# Load the SEC/TRD timeline tables and the Lending Club train/test splits.
sec_timeline = pd.read_csv('./final/data/sec_timeline_scaled.csv')
trd_timeline = pd.read_csv('./final/data/trd_timeline_scaled.csv')
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the recorded run printed a warning pointing at the train read,
# presumably a mixed-type DtypeWarning — confirm it disappears with this flag.
train_data = pd.read_csv('./final/data/lending_club_2020_train.csv', low_memory=False)
test_data = pd.read_csv('./final/data/lending_club_2020_test.csv', low_memory=False)

  train_data = pd.read_csv('./final/data/lending_club_2020_train.csv')


## 01-2. train data

In [3]:
## 01. Drop columns
# Columns removed from the training frame before any transformation.
cols_to_drop = [
    "id", "url", "title", "application_type", "next_pymnt_d",
    "policy_code", "delinq_amnt", "hardship_flag", "zip_code", "emp_title",
    "mo_sin_old_il_acct", "mo_sin_old_rev_tl_op",
    "mths_since_recent_bc", "mths_since_recent_inq",
    "issue_d", "purpose", "addr_state", "earliest_cr_line",
    "initial_list_status", "last_credit_pull_d", "debt_settlement_flag",
    "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl",
    "last_pymnt_d", "last_pymnt_amnt", "pymnt_plan",
    "total_rec_prncp", "total_rec_int", "total_pymnt_inv",
]
# drop() raises KeyError if any listed column is absent from the frame.
train_data = train_data.drop(columns=cols_to_drop)

In [4]:
## 02. Column transformations
# 01. grade, sub_grade: label-encode the category strings as integers
grade = np.array(train_data["grade"])
sub_grade = np.array(train_data["sub_grade"])
encoder = LabelEncoder()
grade_encoded = encoder.fit_transform(grade)
# fit_transform refits the encoder, so from here on `encoder` holds the
# sub_grade classes only (the grade fit is overwritten).
sub_grade_encoded = encoder.fit_transform(sub_grade)

train_data["grade"] = grade_encoded
train_data["sub_grade"] = sub_grade_encoded

# 02. emp_length: map the text labels to a number of years
label_mapping = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10
}
train_data['emp_length'] = train_data['emp_length'].map(label_mapping)
# Values that are missing or not in the mapping become -1.
train_data['emp_length'] = train_data['emp_length'].fillna(-1)

# 03. term: map the text labels to a number of months
# The keys carry a leading space; any value not listed here becomes NaN.
label_mapping = {
    ' 36 months': 36,
    ' 60 months': 60
}

train_data['term'] = train_data['term'].map(label_mapping)

# 04. Convert to float: revol_util, int_rate ("12.3%" -> 0.123)
train_data["revol_util"] = train_data["revol_util"].str.rstrip("%").astype(float) / 100
train_data["int_rate"] = train_data["int_rate"].str.rstrip("%").astype(float) / 100

# 05. loan_status: keep only 'Fully Paid' / 'Charged Off' rows, then encode
#     Fully Paid -> 0, Charged Off -> 1
# .copy() detaches the filtered frame from the original so that the column
# assignment below does not raise SettingWithCopyWarning.
train_data = train_data[train_data['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()
train_data['loan_status'] = np.where(train_data['loan_status'] == 'Fully Paid', 0, 1)

# 06. Dummy variables: home_ownership, verification_status
# drop_first=True omits the first category of each column from the dummies.
train_data = pd.get_dummies(train_data, columns=["home_ownership", "verification_status"], drop_first=True)


In [6]:
## 03. Missing-value handling
# 01. Drop columns in which 40% or more of the values are missing
#     (loan_status is always kept)
train_data = train_data.loc[:, (train_data.isnull().mean() < 0.4) | (train_data.columns == "loan_status")]

# 02. Drop columns that hold data generated after a default
train_data = train_data.drop(columns=["recoveries", "collection_recovery_fee", "collections_12_mths_ex_med", "tot_coll_amt"])

# 03. Drop rows with a missing loan_status
train_data = train_data.dropna(subset=["loan_status"])

# 04. Drop rows whose dti is -1 or 0.0
# .copy() detaches the filtered frame from the original so that the column
# assignments below do not raise SettingWithCopyWarning.
train_data = train_data[(train_data['dti'] != -1) & (train_data['dti'] != 0.0)].copy()

# 05. Impute the remaining missing values
# These two columns are filled with the column MEAN (not the median).
# pct_tl_nvr_dlq: share of accounts that were never delinquent
cols = ['pct_tl_nvr_dlq', 'percent_bc_gt_75']
train_data[cols] = train_data[cols].fillna(train_data[cols].mean())

# The columns below are filled with the column MEDIAN.
cols = [
    'num_actv_rev_tl',
    'num_bc_sats',
    'num_bc_tl',
    'num_op_rev_tl',
    'num_rev_accts',
    'num_rev_tl_bal_gt_0',
    'num_sats',
    'bc_open_to_buy',
    'avg_cur_bal',
    'tot_cur_bal',
    'total_rev_hi_lim',
    'tot_hi_cred_lim',
    'total_bal_ex_mort',
    'total_bc_limit',
    'total_il_high_credit_limit',
    'num_accts_ever_120_pd',
    'mort_acc'
]
train_data[cols] = train_data[cols].fillna(train_data[cols].median())

# revol_util: fill with the column mean
train_data['revol_util'] = train_data['revol_util'].fillna(train_data['revol_util'].mean())

# bc_util: divide by 100 (presumably percent -> fraction; confirm), then fill
# with the column mean. Re-running this cell would divide by 100 a second time.
train_data['bc_util'] = train_data['bc_util'] / 100.0
train_data['bc_util'] = train_data['bc_util'].fillna(train_data['bc_util'].mean())

In [8]:
## 04. Derived features
# 01. fico_avg: average of fico_range_low and fico_range_high, inserted at the
#     position fico_range_low occupied; both source columns are then removed.
fico_cols = ['fico_range_low', 'fico_range_high']
fico_avg = (train_data['fico_range_low'] + train_data['fico_range_high']) / 2
train_data.insert(train_data.columns.get_loc('fico_range_low'), 'fico_avg', fico_avg)
train_data = train_data.drop(columns=fico_cols)

# 02. recovery_rate: total_pymnt / (term * installment), capped at 1
# (Original author's note: this definition may change later.)
scheduled_total = train_data['term'] * train_data['installment']
train_data['recovery_rate'] = (train_data['total_pymnt'] / scheduled_total).clip(upper=1)
train_data = train_data.drop(columns=['total_pymnt', 'installment'])

In [9]:
# Display the transformed training frame for a visual check.
train_data

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,grade,sub_grade,emp_length,annual_inc,loan_status,...,total_bc_limit,total_il_high_credit_limit,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,recovery_rate
0,6000.0,6000.0,6000.0,36.0,0.0797,0,4,2.0,45000.0,0,...,15000.0,56511.0,True,False,False,False,False,False,True,0.993053
2,23200.0,23200.0,23200.0,60.0,0.2499,4,23,10.0,110000.0,1,...,20300.0,291465.0,True,False,False,False,False,False,True,0.210523
5,16000.0,16000.0,16000.0,36.0,0.0707,0,1,-1.0,65000.0,1,...,59100.0,49339.0,True,False,False,False,False,False,False,0.663020
6,4500.0,4500.0,4500.0,36.0,0.1042,1,7,5.0,50000.0,0,...,9300.0,11970.0,True,False,False,False,False,False,False,0.976153
7,20000.0,20000.0,20000.0,36.0,0.0999,1,7,10.0,60000.0,0,...,11000.0,73090.0,True,False,False,False,False,True,False,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1755288,26500.0,26500.0,26450.0,60.0,0.1727,2,14,2.0,60000.0,0,...,18900.0,16784.0,True,False,False,False,False,False,True,0.965027
1755290,3600.0,3600.0,3600.0,36.0,0.1999,4,20,7.0,33800.0,1,...,6500.0,13375.0,True,False,False,False,False,False,False,0.097397
1755291,19000.0,19000.0,18875.0,36.0,0.0668,0,2,4.0,71000.0,0,...,17600.0,43550.0,True,False,False,False,False,False,False,0.980974
1755292,9600.0,9600.0,9600.0,36.0,0.1433,2,10,9.0,44000.0,0,...,4200.0,10414.0,True,False,False,False,False,False,True,0.993780


In [10]:
## 06. Row count
# Drop every row that still contains a missing value, then print the frame
# summary (row count, columns, dtypes). The result is reassigned instead of
# using inplace=True, so the frame is never mutated behind the name.
train_data = train_data.dropna()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1033364 entries, 0 to 1755293
Data columns (total 63 columns):
 #   Column                               Non-Null Count    Dtype  
---  ------                               --------------    -----  
 0   loan_amnt                            1033364 non-null  float64
 1   funded_amnt                          1033364 non-null  float64
 2   funded_amnt_inv                      1033364 non-null  float64
 3   term                                 1033364 non-null  float64
 4   int_rate                             1033364 non-null  float64
 5   grade                                1033364 non-null  int32  
 6   sub_grade                            1033364 non-null  int32  
 7   emp_length                           1033364 non-null  float64
 8   annual_inc                           1033364 non-null  float64
 9   loan_status                          1033364 non-null  int32  
 10  dti                                  1033364 non-null  float64
 11  del

In [11]:
# Collect the column names of sec_timeline and trd_timeline as plain lists
sec_features = list(sec_timeline.columns)
trd_features = list(trd_timeline.columns)

print("SEC Timeline Features:", sec_features)
print("TRD Timeline Features:", trd_features)


SEC Timeline Features: ['term', 'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_avg', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'acc_now_delinq', 'tot_cur_bal', 'total_rev_hi_lim', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'mort_acc', 'num_accts_ever_120_pd', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'home_ownership_MORTGAGE', 'home_ownership_NONE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'verification_status_Source Verified', 'verification_status_Verified', 'loan_status', 'grade', 'sub_grade', 'loan_amnt', 'int_rate']
TRD Timeline Features: ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp

In [4]:
# Disabled: select the trd_timeline columns from the Lending Club frames.
# NOTE(review): the recorded run of this cell raised a KeyError for 'fico_avg'
# and the home_ownership_* / verification_status_* dummy columns. Presumably at
# least one of the two frames had not yet gone through the steps that create
# those columns — confirm before re-enabling.
# train_trd = train_data[trd_features]
# test_trd = test_data[trd_features]

KeyError: "['fico_avg', 'home_ownership_MORTGAGE', 'home_ownership_NONE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'verification_status_Source Verified', 'verification_status_Verified'] not in index"

## 01-3. test data

# 02. T-bill 데이터 정제

In [27]:
# Read the T-bill CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative or configurable.
tbill_csv_path = '/Users/hyunbin/Library/CloudStorage/OneDrive-Personal/05 빅데이터 전문가 과정/01 통계 데이터 사이언스/00 팀플/final/new/00 data/t-bill_3M.csv'
tbill = pd.read_csv(tbill_csv_path)

In [28]:
# 1. Drop rows with NaN in observation_date or DTB3, then parse
#    observation_date as datetime.
# assign() returns a new frame, so no column is set on the object returned by
# dropna(); that pattern can raise SettingWithCopyWarning.
# NOTE(review): the recorded run printed a warning at the column assignment,
# presumably that one — confirm it is gone.
tbill = (
    tbill
    .dropna(subset=['observation_date', 'DTB3'])
    .assign(observation_date=lambda frame: pd.to_datetime(frame['observation_date']))
)

  tbill['observation_date'] = pd.to_datetime(tbill['observation_date'])


In [29]:
# 2. Keep one row per calendar month: the observation dated the 15th when it
#    exists, otherwise the middle row of that month's date-sorted observations
#    (a middle position by date, not the median of the values).
def _pick_mid_month_row(month_rows):
    """Return the representative row (a Series) for one month of observations.

    The first row dated the 15th is preferred. When the month has no such row,
    the rows are sorted by observation_date and the one at position
    len // 2 is returned (the later of the two middle rows for an even count).
    """
    on_15th = month_rows[month_rows['observation_date'].dt.day == 15]
    if not on_15th.empty:
        return on_15th.iloc[0]
    ordered = month_rows.sort_values('observation_date')
    return ordered.iloc[len(ordered) // 2]


# Group by (year, month) and pick one row from every group.
obs_date = tbill['observation_date']
result_rows = [
    _pick_mid_month_row(group)
    for _, group in tbill.groupby([obs_date.dt.year, obs_date.dt.month])
]

# Assemble the picked rows into a frame ordered by date.
tbill_15 = pd.DataFrame(result_rows)
tbill_15 = tbill_15.sort_values('observation_date').reset_index(drop=True)

In [30]:
# 3. Reformat the dates as abbreviated month plus two-digit year (e.g. Jan-20).
# After this the column holds strings, so the cell cannot be re-run on its own
# output: the .dt accessor needs datetime values.
month_labels = tbill_15['observation_date'].dt.strftime('%b-%y')
tbill_15['observation_date'] = month_labels

In [31]:

# 4. Save the monthly table as CSV, without the index column.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative or configurable.
tbill_out_path = '/Users/hyunbin/Library/CloudStorage/OneDrive-Personal/05 빅데이터 전문가 과정/01 통계 데이터 사이언스/00 팀플/final/new/00 data/tbill_15_mod.csv'
tbill_15.to_csv(tbill_out_path, index=False)