In [8]:
import re
import string

import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.preprocessing import StandardScaler

# === 1. Load data ===
# Mount Google Drive so the CSV stored under MyDrive can be read.
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/lending_club_2020_train.csv의 사본"
# low_memory=False avoids chunked parsing, which can otherwise yield
# mixed-type columns.
df = pd.read_csv(file_path, low_memory=False)

# === 2. Build the target variable ===
def map_loan_status(status):
    """Translate a loan_status label into a binary default flag.

    Args:
        status: Raw ``loan_status`` value.

    Returns:
        ``0`` for ``'Fully Paid'``, ``1`` for ``'Charged Off'`` or
        ``'Default'``, and ``np.nan`` for every other value.
    """
    if status == 'Fully Paid':
        return 0
    if status in ('Charged Off', 'Default'):
        return 1
    # Any other status is treated as having no known outcome.
    return np.nan

# Derive the binary target, then keep only the rows whose status mapped to
# 0 or 1 (unmapped statuses come back as NaN and are discarded).
df['default'] = df['loan_status'].map(map_loan_status)
df = df.loc[df['default'].notna()].copy()

# === 3. Drop unneeded columns ===
# Column names to remove from df. Names that are not present in df are
# filtered out in the drop call below, so a misspelled entry is silently
# ignored rather than raising.
# NOTE(review): 'verified_status_joint' may be a typo for
# 'verification_status_joint' — confirm against the CSV header.
drop_cols = [
    'hardship_loan_status', 'hardship_type', 'hardship_reason', 'hardship_status', 'deferral_term',
    'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'hardship_length',
    'hardship_dpd', 'orig_projected_additional_accrued_interest', 'hardship_amount',
    'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'sec_app_revol_util',
    'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_earliest_cr_line',
    'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_open_act_il',
    'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',
    'annual_inc_joint', 'dti_joint', 'mths_since_last_record', 'mths_since_recent_bc_dlq',
    'mths_since_last_major_derog', 'next_pymnt_d', 'inq_fi', 'total_cu_tl', 'emp_title',
    'num_actv_bc_tl', 'hardship_flag', 'title', 'last_pymnt_d', 'collection_recovery_fee',
    'earliest_cr_line', 'funded_amnt', 'funded_amnt_inv', 'id', 'initial_list_status', 'installment',
    'int_rate', 'issue_d', 'last_credit_pull_d', 'last_pymnt_amnt', 'loan_amnt', 'loan_status',
    'out_prncp', 'out_prncp_inv', 'policy_code', 'pymnt_plan', 'recoveries', 'term',
    'total_pymnt', 'total_pymnt_inv', 'total_rec_int', 'total_rec_late_fee', 'total_rec_prncp',
    'url', 'zip_code', 'debt_settlement_flag', 'desc', 'member_id', 'verified_status_joint',
    'sec_app_mths_since_last_major_derog', 'disbursement_method', 'debt_settlement_flag_date',
    'settlement_status', 'settlement_date', 'settlement_amount', 'settlement_percentage',
    'settlement_term'
]
# Only request columns that actually exist so the drop never raises KeyError.
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)

# === 4. Preprocess emp_length ===
# Compiled once at module level so the per-row converter only runs a search.
_EMP_LENGTH_DIGITS = re.compile(r'\d+')


def process_emp_length(x):
    """Convert an employment-length label into a number of years.

    Args:
        x: Raw ``emp_length`` value such as ``'< 1 year'``, ``'3 years'``,
            ``'10+ years'``, an already-numeric value, or a missing value.

    Returns:
        ``np.nan`` for missing values; ``float(x)`` for non-string values;
        ``0.5`` when the label contains ``'< 1'``; ``10.0`` when it contains
        ``'10+'``; otherwise the first run of digits in the label as a
        float, or ``np.nan`` when the label holds no digits.
    """
    if pd.isna(x):
        return np.nan
    if not isinstance(x, str):
        # Already numeric: pass through instead of failing on the
        # substring checks below.
        return float(x)
    if '< 1' in x:
        return 0.5
    if '10+' in x:
        return 10.0
    # A plain regex search replaces building a one-element Series per row.
    found = _EMP_LENGTH_DIGITS.search(x)
    return float(found.group()) if found else np.nan

# Convert the emp_length labels to numbers, but only when the column is
# present in df.
if 'emp_length' in df:
    df['emp_length'] = df['emp_length'].map(process_emp_length)

# === 5. Preprocess revol_util ===
def preprocess_revol_util(x):
    """Parse a revolving-utilisation value such as ``'45.3%'`` into a float.

    Args:
        x: Raw ``revol_util`` value; a string with an optional ``%`` sign,
            a number, or a missing value.

    Returns:
        The numeric value with surrounding whitespace and ``%`` characters
        removed, or ``np.nan`` when the value is missing or is not a valid
        number.
    """
    if pd.isna(x):
        return np.nan
    cleaned = str(x).strip().replace('%', '')
    try:
        return float(cleaned)
    except ValueError:
        # Only unparseable text is treated as missing; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan

# Convert the revol_util values to floats, but only when the column is
# present in df.
if 'revol_util' in df:
    df['revol_util'] = df['revol_util'].map(preprocess_revol_util)

# === 6. Encode grade and sub_grade as ordinal numbers ===
# 'A'..'G' -> 1..7
grade_map = dict(zip('ABCDEFG', range(1, 8)))

# 'A1', 'A2', ..., 'G5' -> 1..35
sub_grades = [
    f"{letter}{n}"
    for letter in string.ascii_uppercase[:7]
    for n in range(1, 6)
]
sub_grade_map = {label: rank for rank, label in enumerate(sub_grades, start=1)}

# === 7. Replace the original columns with their numeric encoding ===
# Mapping directly onto the existing column keeps its position in df and
# avoids creating and dropping temporary *_num columns. Labels that are not
# in the mapping become NaN.
for col, mapping in (('grade', grade_map), ('sub_grade', sub_grade_map)):
    if col in df.columns:
        df[col] = df[col].map(mapping)

# === 8. One-hot encode string columns (0/1 indicators) ===
# Pull the object-dtype columns once, label their missing values 'Missing',
# and write them back before expanding each column into uint8 dummies.
object_frame = df.select_dtypes(include=['object'])
cat_cols = object_frame.columns.tolist()
df[cat_cols] = object_frame.fillna('Missing')
del object_frame
df = pd.get_dummies(
    df,
    columns=cat_cols,
    drop_first=False,
    dtype='uint8',
)

# === 9. Impute numeric missing values (strategy depends on missing rate) ===
# Only float64/int64 columns are considered, and the target is excluded.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('default')
missing_ratios = df[num_cols].isna().mean()

is_high = missing_ratios >= 0.1
is_low = (missing_ratios > 0) & (missing_ratios < 0.1)
high_missing_cols = missing_ratios.index[is_high].tolist()
low_missing_cols = missing_ratios.index[is_low].tolist()

# High missing rate: add a 0/1 "was missing" indicator, then fill with 0.
for col in high_missing_cols:
    df[f'{col}_missing'] = df[col].isna().astype(int)
    df[col] = df[col].fillna(0)

# Low missing rate: replace missing values with the column mean.
for col in low_missing_cols:
    col_mean = df[col].mean()
    df[col] = df[col].fillna(col_mean)

# === 11. Memory cleanup ===
# Rebind df to a fresh copy of itself.
# NOTE(review): presumably meant to consolidate the frame after the column
# inserts above — confirm.
df = df.copy()

# === 12. Save ===
# index=False keeps the row index out of the written CSV.
df.to_csv("lending_club_processed2.csv", index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Reload the processed dataset and preview its first rows.
# NOTE(review): the save step writes to a relative path while this reads an
# absolute one; this assumes the working directory was /content — confirm.
file_path = "/content/lending_club_processed2.csv"
df = pd.read_csv(file_path, low_memory=False)

df.head()

Unnamed: 0,grade,sub_grade,emp_length,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,...,mths_since_rcnt_il_missing,total_bal_il_missing,il_util_missing,open_rv_12m_missing,open_rv_24m_missing,max_bal_bc_missing,all_util_missing,inq_last_12m_missing,mths_since_recent_inq_missing,mths_since_recent_revol_delinq_missing
0,1,5,2.0,45000.0,8.67,1.0,755.0,759.0,0.0,22.0,...,0,0,0,0,0,0,0,0,0,1
1,5,24,10.0,110000.0,34.7,1.0,670.0,674.0,1.0,10.0,...,0,0,0,0,0,0,0,0,0,0
2,1,2,5.970756,65000.0,17.74,0.0,820.0,824.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,2,8,5.0,50000.0,6.99,0.0,700.0,704.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,2,8,10.0,60000.0,28.94,0.0,675.0,679.0,0.0,40.0,...,1,1,1,1,1,1,1,1,0,0
