In [1]:
import ast
import os
import pickle

import pandas as pd

In [2]:
# NOTE(review): machine-specific absolute path. The relative `data/...` paths
# used by later cells resolve against this working directory, so the notebook
# only runs as-is on a machine with this folder layout.
%cd "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/tablegan" 

C:\Users\GCU\Lending_club\Data_Analysis_lending-club\tablegan


In [3]:
def cat_to_num(df, sep=',', cat_names=None):
    """Integer-encode the categorical columns of ``df`` with ``pd.factorize``.

    A column is encoded when it has ``object`` dtype or when it is listed in
    ``cat_names``. The input frame is not modified.

    Args:
        df: Source DataFrame.
        sep: Unused; kept so existing callers that pass it keep working.
        cat_names: Extra column names to encode regardless of their dtype.

    Returns:
        ``(df_num, subs)`` where ``df_num`` is a copy of ``df`` with the
        selected columns replaced by their integer codes, and ``subs`` maps
        each encoded column name to ``{'y': codes, 'label': uniques}`` as
        returned by ``pd.factorize``.

    Raises:
        KeyError: If a name in ``cat_names`` is not a column of ``df``.
    """
    if cat_names is None:
        cat_names = []
    subs = {}
    df_num = df.copy()

    object_columns = df_num.select_dtypes(include=['object']).columns.tolist()
    # De-duplicate so a column is never factorized twice, but keep first-seen
    # order: iterating a set of strings would make the key order of `subs`
    # (and of anything pickled from it) differ from one interpreter run to the next.
    columns_to_encode = list(dict.fromkeys(object_columns + list(cat_names)))

    for column in columns_to_encode:
        codes, uniques = pd.factorize(df[column])
        subs[column] = {'y': codes, 'label': uniques}
        df_num[column] = codes
    return df_num, subs

In [4]:
def transform(ds: str, suffix: str = '', sep: str = ',', ext: str = 'csv',
              drop: list = None, cat_names: list = None, to_disk: bool = True,
              target: str = '', d_basepath: str = 'data'):
    """Read a dataset CSV, integer-encode its categorical columns and save the result.

    The source file is ``./<d_basepath>/<ds>/<ds><suffix>.<ext>``. String cells
    are stripped of whitespace and double quotes and converted to ``float``
    (or to a Python literal when they look like ``[...]``) where possible.
    The ``drop`` columns are then removed and the remaining categorical
    columns are encoded with ``cat_to_num``. The encoding lookup is pickled to
    ``<d_basepath>/<ds>/subs.pkl`` regardless of ``to_disk``; a failure to
    write it is reported but does not abort the run.

    Args:
        ds: Dataset name; used both as the folder name and the file stem.
        suffix: Text appended to the file stem of the source file.
        sep: Field separator of the source CSV.
        ext: Extension of the source file.
        drop: Columns to remove before encoding; missing ones are ignored.
        cat_names: Columns to encode even if they are not ``object`` dtype.
        to_disk: When true, write ``<stem>_encoding.csv`` (and
            ``<stem>_labels.csv`` if ``target`` is set) next to the source.
        target: Name of the label column to split off; ``''`` means none.
        d_basepath: Root folder that contains the dataset folders.

    Returns:
        ``(df_num, y, subs)`` when ``target`` is given, otherwise
        ``(df_num, subs)``. If the CSV cannot be read or the encoding fails,
        a message is printed and a tuple of ``None`` values of the same length
        is returned.
    """
    if drop is None:
        drop = []
    if cat_names is None:
        cat_names = []
    has_target = bool(target)
    # On failure hand back as many Nones as the success path returns values, so
    # `a, b, c = transform(..., target=...)` does not die on tuple unpacking
    # and hide the message printed here.
    failure = (None, None, None) if has_target else (None, None)

    base_fname = f'./{d_basepath}/{ds}/{ds}'
    source_fname = f'{base_fname}{suffix}.{ext}'
    print(f'Basepath: {base_fname}')
    print(f'Source file: {source_fname}')

    # Read the CSV; malformed rows are skipped with a warning.
    try:
        try:
            df = pd.read_csv(source_fname, sep=sep, quotechar='"', on_bad_lines='warn')
        except TypeError:
            # pandas < 1.3 has no `on_bad_lines`; fall back to the legacy flag
            # (which pandas >= 2.0 no longer accepts).
            df = pd.read_csv(source_fname, sep=sep, quotechar='"', error_bad_lines=False)
    except pd.errors.ParserError as e:
        print(f"❌ CSV 파싱 오류: {e}")
        return failure
    except Exception as e:
        print(f"❌ CSV 파일 로드 중 오류 발생: {e}")
        return failure

    def convert_to_number(value):
        """Turn a numeric or list-like string into a number/list; pass anything else through."""
        if not isinstance(value, str):
            return value
        cleaned = value.strip().replace('"', '')
        try:
            # String-encoded list: "[1.0, 2.0]" -> [1.0, 2.0]
            if cleaned.startswith('[') and cleaned.endswith(']'):
                # NOTE(review): parsed lists are unhashable; confirm that a
                # column holding them can still be factorized downstream.
                return ast.literal_eval(cleaned)
            return float(cleaned)  # plain numeric string
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not convertible: keep the cleaned text. Only the errors that
            # float()/literal_eval raise for bad input are caught, so genuine
            # programming errors are no longer silently swallowed.
            return cleaned

    # DataFrame.applymap is deprecated in favour of DataFrame.map on newer
    # pandas. Check the class, not the instance, so a column that happens to
    # be called "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, 'map'):
        df = df.map(convert_to_number)
    else:
        df = df.applymap(convert_to_number)

    # Remove the unwanted columns.
    df = df.drop(drop, axis=1, errors='ignore')

    # Encode the categorical columns as integers.
    try:
        df_num, subs = cat_to_num(df, cat_names=cat_names)
    except Exception as e:
        print(f"❌ cat_to_num 함수 오류: {e}")
        return failure

    # Persist the encoding lookup (best effort).
    try:
        with open(f'{d_basepath}/{ds}/subs.pkl', 'wb') as f:
            pickle.dump(subs, f)
    except Exception as e:
        print(f"❌ subs.pkl 저장 오류: {e}")

    # Split off the label column.
    if has_target:
        y = df_num[target]
        df_num = df_num.drop([target], axis=1)

    if to_disk:
        if has_target:
            target_fname_y = f'{base_fname}_labels.csv'
            print(f'✅ 타겟 레이블 파일 저장: {target_fname_y}')
            y.to_csv(target_fname_y, sep=',', index=False)

        target_fname = f'{base_fname}_encoding.csv'
        print(f'✅ 변환된 데이터 저장: {target_fname}')
        df_num.to_csv(target_fname, sep=',', index=False)

    if has_target:
        return df_num, y, subs
    return df_num, subs


In [5]:
# Columns passed to transform() as `cat_names`, i.e. encoded as integer
# categories even when their dtype is not `object`.
cat_names = [
    'term',
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'addr_state',
    'initial_list_status',
    'application_type',
    'hardship_flag',
    'debt_settlement_flag',
    'loan_status',
]

In [7]:
# Encode the training set and split `loan_status` off as the label column.
# With a target set, transform() returns three values:
#   a -> encoded feature frame, b -> encoded label column, c -> encoding lookup
a, b, c = transform(
    'defalut_original_train_before_2017_label_defalut',
    suffix='',
    to_disk=True,
    target='loan_status',
    cat_names=cat_names,
)
a.head()

Basepath: ./data/defalut_original_train_before_2017_label_defalut/defalut_original_train_before_2017_label_defalut
Source file: ./data/defalut_original_train_before_2017_label_defalut/defalut_original_train_before_2017_label_defalut.csv




  """Entry point for launching an IPython kernel.


✅ 타겟 레이블 파일 저장: ./data/defalut_original_train_before_2017_label_defalut/defalut_original_train_before_2017_label_defalut_labels.csv
✅ 변환된 데이터 저장: ./data/defalut_original_train_before_2017_label_defalut/defalut_original_train_before_2017_label_defalut_encoding.csv


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,home_ownership,annual_inc,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,27600.0,27600.0,27600.0,0,19.97,730.78,0,0,0,73000.0,...,87.5,80.0,0.0,0.0,261675.0,37808.0,19200.0,14075.0,0,0
1,8000.0,8000.0,8000.0,1,10.99,261.88,1,1,0,33000.0,...,100.0,75.0,1.0,0.0,33226.0,15949.0,8200.0,12426.0,0,0
2,10000.0,10000.0,10000.0,0,13.98,232.58,2,2,1,25000.0,...,68.2,100.0,0.0,0.0,40142.0,24419.0,3000.0,28642.0,0,0
3,10400.0,10400.0,10400.0,0,17.57,261.67,0,3,1,105680.0,...,100.0,100.0,0.0,0.0,213593.0,66793.0,9600.0,56993.0,0,0
4,18450.0,18450.0,18450.0,1,13.98,630.4,2,2,1,65000.0,...,96.9,37.5,0.0,0.0,63402.0,23136.0,28100.0,21002.0,0,0


In [9]:
# Reload the `_encoding.csv` file saved by the transform() call to inspect its
# columns and dtypes.
dataset = pd.read_csv(
    "data/defalut_original_train_before_2017_label_defalut/defalut_original_train_before_2017_label_defalut_encoding.csv"
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287765 entries, 0 to 287764
Data columns (total 65 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   loan_amnt                   287765 non-null  float64
 1   funded_amnt                 287765 non-null  float64
 2   funded_amnt_inv             287765 non-null  float64
 3   term                        287765 non-null  int64  
 4   int_rate                    287765 non-null  float64
 5   installment                 287765 non-null  float64
 6   grade                       287765 non-null  int64  
 7   sub_grade                   287765 non-null  int64  
 8   home_ownership              287765 non-null  int64  
 9   annual_inc                  287765 non-null  float64
 10  verification_status         287765 non-null  int64  
 11  zip_code                    287765 non-null  float64
 12  addr_state                  287765 non-null  int64  
 13  dti           

In [10]:
# Show the distinct values of every column of the encoded dataset.
for column_name in dataset.columns:
    distinct_values = dataset[column_name].unique()
    print(column_name, "\n", distinct_values, '\n')

loan_amnt 
 [27600.  8000. 10000. ... 35575. 38700. 35275.] 

funded_amnt 
 [27600.  8000. 10000. ... 35575. 38700. 35275.] 

funded_amnt_inv 
 [27600.  8000. 10000. ... 38700. 36625. 35275.] 

term 
 [0 1] 

int_rate 
 [19.97 10.99 13.98 17.57 18.25 21.48 23.4  25.89 15.61 14.47 19.22  8.9
 13.53 16.24 23.7  12.85 24.08 22.9  16.99 20.5  14.98 25.83  7.9   7.62
 11.99 22.4  24.5  24.99  9.67 25.8   6.03 25.57  6.62 25.99 13.67 15.1
 12.99 16.2  21.7  19.2  14.3  23.1  18.55 21.   17.1  19.52 17.76 20.2
 26.06  8.6   9.25  9.99  6.97 16.78 22.2  18.85 11.55 14.33  9.71 23.5
 24.89 21.15 15.88 10.64 12.35 13.68 17.56 22.7  15.22 20.8  20.31 13.05
 25.28 21.6  12.12 19.72 11.14 15.8  15.31 10.16 17.27 17.77 18.75 18.49
 24.7  23.83 13.11 22.95 23.28 16.29 22.47 19.05 21.98 20.49 14.09 21.49
 23.63 23.76 24.83  6.    7.97 17.09 28.72 30.84 24.85 19.03 14.08 20.
 18.06 15.05 12.62 10.42 16.02 30.89 29.69  9.93 21.45 30.17 10.91 30.65
 13.59  9.44  7.35 23.88 30.99 26.3  30.94 22.91  5.32 3