In [1]:
import ast
import os
import pickle

import pandas as pd

In [2]:
# Switch the kernel's working directory to the tablegan project folder so the
# relative `data/...` paths used by later cells resolve.
# NOTE(review): this is an absolute, machine-specific Windows path; it must be
# edited (or made configurable) before the notebook can run anywhere else.
%cd "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/tablegan" 

C:\Users\GCU\Lending_club\Data_Analysis_lending-club\tablegan


In [3]:
def cat_to_num(df, sep=',', cat_names=None):
    """Integer-encode the categorical columns of ``df``.

    Every column with ``object`` dtype, plus every column listed in
    ``cat_names``, is replaced by the integer codes produced by
    ``pd.factorize`` (missing values receive the factorize sentinel ``-1``).

    Args:
        df: Source DataFrame. It is not modified.
        sep: Unused. Kept only so existing callers that pass it keep working.
        cat_names: Extra column names to encode even when their dtype is not
            ``object``. ``None`` means "no extra columns".

    Returns:
        A tuple ``(df_num, subs)``. ``df_num`` is a copy of ``df`` with the
        selected columns replaced by integer codes. ``subs`` maps each encoded
        column name to ``{'y': codes, 'label': uniques}`` so the encoding can
        be reversed later.

    Raises:
        KeyError: If ``cat_names`` contains a name that is not a column of
            ``df``.
    """
    if cat_names is None:
        cat_names = []
    df_num = df.copy()

    object_cols = df_num.select_dtypes(include=['object']).columns.tolist()
    # De-duplicate so a column that is both object-typed and listed in
    # cat_names is factorized only once. dict.fromkeys keeps first-seen order,
    # unlike set(), whose string ordering changes from run to run and made the
    # key order of `subs` (and therefore the pickled file) non-reproducible.
    columns = list(dict.fromkeys(object_cols + list(cat_names)))

    # Fail up front with the full list of offenders instead of a bare KeyError
    # raised halfway through the loop.
    missing = [col for col in columns if col not in df_num.columns]
    if missing:
        raise KeyError(f'cat_names columns not found in DataFrame: {missing}')

    subs = {}
    for col in columns:
        codes, labels = pd.factorize(df[col])
        subs[col] = {'y': codes, 'label': labels}
        df_num[col] = codes
    return df_num, subs

In [4]:
def transform(ds: str, suffix: str = '', sep: str = ',', ext: str = 'csv',
              drop: list = None, cat_names: list = None, to_disk: bool = True,
              target: str = '', d_basepath: str = 'data'):
    """Load a dataset CSV, integer-encode its categorical columns and save it.

    The source file is ``./<d_basepath>/<ds>/<ds><suffix>.<ext>``. String
    cells are cleaned first (see ``convert_to_number`` below), the columns in
    ``drop`` are removed, and the remaining categorical columns are encoded
    with ``cat_to_num``.

    Files written:
        * ``<d_basepath>/<ds>/subs.pkl`` - the pickled encoding tables. This
          is written on every successful encode, even when ``to_disk`` is
          False; a failure to write it is reported but does not abort.
        * ``./<d_basepath>/<ds>/<ds>_encoding.csv`` - encoded features
          (only when ``to_disk`` is True).
        * ``./<d_basepath>/<ds>/<ds>_labels.csv`` - encoded target column
          (only when ``to_disk`` is True and ``target`` is given).

    Args:
        ds: Dataset name; used as both the folder and the file stem.
        suffix: Text appended to the file stem of the source file.
        sep: Field separator of the source CSV.
        ext: Extension of the source file.
        drop: Column names to remove before encoding; unknown names are
            ignored. ``None`` means drop nothing.
        cat_names: Columns to encode in addition to the ``object``-typed ones.
        to_disk: Whether to write the encoded CSV file(s).
        target: Name of the target column. When non-empty, the column is split
            off and returned separately.
        d_basepath: Directory, relative to the working directory, that holds
            the dataset folders.

    Returns:
        ``(df_num, y, subs)`` when ``target`` is given, otherwise
        ``(df_num, subs)``. If loading or encoding fails, a message is printed
        and a tuple of ``None`` with the same length is returned, so the
        caller's tuple unpacking does not hide the real error.

    Raises:
        KeyError: If ``target`` is given but is not a column of the encoded
            frame.
    """
    has_target = bool(target)
    if drop is None:
        drop = []
    if cat_names is None:
        cat_names = []

    # Failure value with the same arity as the success value.
    failure = (None, None, None) if has_target else (None, None)

    base_fname = f'./{d_basepath}/{ds}/{ds}'
    source_fname = f'{base_fname}{suffix}.{ext}'
    print(f'Basepath: {base_fname}')
    print(f'Source file: {source_fname}')

    # Read the CSV, skipping malformed rows with a warning.
    try:
        try:
            # pandas >= 1.3 spelling of "warn about and skip bad lines".
            df = pd.read_csv(source_fname, sep=sep, quotechar='"', on_bad_lines='warn')
        except TypeError:
            # pandas < 1.3 does not know `on_bad_lines`; use the legacy flag
            # (which was removed again in pandas 2.0).
            df = pd.read_csv(source_fname, sep=sep, quotechar='"', error_bad_lines=False)
    except pd.errors.ParserError as e:
        print(f"❌ CSV 파싱 오류: {e}")
        return failure
    except Exception as e:
        print(f"❌ CSV 파일 로드 중 오류 발생: {e}")
        return failure

    def convert_to_number(value):
        """Clean one cell: strip quotes/whitespace and parse numbers and lists.

        Non-string values are returned unchanged. A string that looks like a
        list literal (``"[1.0, 2.0]"``) is parsed and returned as a tuple; a
        tuple is used rather than a list so the value stays hashable for
        ``pd.factorize``. Any other string is converted with ``float``.
        When parsing fails, the cleaned string is returned.
        """
        if not isinstance(value, str):
            return value
        text = value.strip().replace('"', '')
        try:
            if text.startswith('[') and text.endswith(']'):
                parsed = tuple(ast.literal_eval(text))
                hash(parsed)  # TypeError for unhashable nested items -> keep text
                return parsed
            return float(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only parsing failures are absorbed here; anything else (such as
            # a missing import) must surface instead of being hidden.
            return text

    # The converter returns non-strings untouched, so numeric columns are
    # skipped: the result is the same and the per-cell Python call is avoided.
    for col in df.select_dtypes(exclude=['number']).columns:
        df[col] = df[col].map(convert_to_number)

    # Remove unwanted columns; names that do not exist are ignored.
    df = df.drop(columns=drop, errors='ignore')

    # Encode categorical columns as integers.
    try:
        df_num, subs = cat_to_num(df, cat_names=cat_names)
    except Exception as e:
        print(f"❌ cat_to_num 함수 오류: {e}")
        return failure

    # Persist the encoding tables. Best effort: a write failure is reported
    # but the encoded data is still returned.
    try:
        with open(f'{d_basepath}/{ds}/subs.pkl', 'wb') as f:
            pickle.dump(subs, f)
    except Exception as e:
        print(f"❌ subs.pkl 저장 오류: {e}")

    # Split the target column off the feature frame.
    if has_target:
        y = df_num[target]
        df_num = df_num.drop([target], axis=1)

    if to_disk:
        if has_target:
            target_fname_y = f'{base_fname}_labels.csv'
            print(f'✅ 타겟 레이블 파일 저장: {target_fname_y}')
            y.to_csv(target_fname_y, sep=',', index=False)

        target_fname = f'{base_fname}_encoding.csv'
        print(f'✅ 변환된 데이터 저장: {target_fname}')
        df_num.to_csv(target_fname, sep=',', index=False)

    if has_target:
        return df_num, y, subs
    return df_num, subs


In [5]:
# Columns that must be integer-encoded even if pandas does not load them as
# `object` dtype. `loan_status` is included because it is also the target.
cat_names = [
    'term',
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'addr_state',
    'initial_list_status',
    'application_type',
    'hardship_flag',
    'debt_settlement_flag',
    'loan_status',
]

In [6]:
# Encode the dataset and write the `_encoding.csv` / `_labels.csv` files.
# With a target given, transform() returns three values:
#   a - encoded feature frame (target column removed)
#   b - encoded target column
#   c - per-column encoding tables
a, b, c = transform(
    'defalut_original_train_before_2017',
    suffix='',
    to_disk=True,
    target='loan_status',
    cat_names=cat_names,
)
a.head()

Basepath: ./data/defalut_original_train_before_2017/defalut_original_train_before_2017
Source file: ./data/defalut_original_train_before_2017/defalut_original_train_before_2017.csv




  """Entry point for launching an IPython kernel.


✅ 타겟 레이블 파일 저장: ./data/defalut_original_train_before_2017/defalut_original_train_before_2017_labels.csv
✅ 변환된 데이터 저장: ./data/defalut_original_train_before_2017/defalut_original_train_before_2017_encoding.csv


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,home_ownership,annual_inc,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,12000.0,12000.0,12000.0,0,10.99,392.81,0,0,0,60000.0,...,100.0,0.0,0.0,0.0,29700.0,7137.0,18100.0,0.0,0,0
1,4800.0,4800.0,4800.0,0,10.99,157.13,0,0,1,39600.0,...,100.0,0.0,0.0,0.0,25700.0,4136.0,25700.0,0.0,0,0
2,27050.0,27050.0,27050.0,0,10.99,885.46,0,0,2,55000.0,...,100.0,25.0,0.0,0.0,138554.0,70186.0,35700.0,33054.0,0,0
3,12000.0,12000.0,12000.0,0,7.62,373.94,1,1,1,96500.0,...,100.0,100.0,0.0,0.0,233004.0,46738.0,14800.0,53404.0,0,0
4,14000.0,14000.0,14000.0,0,12.85,470.71,0,2,0,88000.0,...,78.6,100.0,1.0,0.0,31840.0,17672.0,3900.0,27340.0,0,0


In [7]:
# Reload the encoded features from disk and summarise column dtypes and
# non-null counts.
dataset = pd.read_csv(
    "data/defalut_original_train_before_2017/defalut_original_train_before_2017_encoding.csv"
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1495022 entries, 0 to 1495021
Data columns (total 65 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   loan_amnt                   1495022 non-null  float64
 1   funded_amnt                 1495022 non-null  float64
 2   funded_amnt_inv             1495022 non-null  float64
 3   term                        1495022 non-null  int64  
 4   int_rate                    1495022 non-null  float64
 5   installment                 1495022 non-null  float64
 6   grade                       1495022 non-null  int64  
 7   sub_grade                   1495022 non-null  int64  
 8   home_ownership              1495022 non-null  int64  
 9   annual_inc                  1495022 non-null  float64
 10  verification_status         1495022 non-null  int64  
 11  zip_code                    1495022 non-null  float64
 12  addr_state                  1495022 non-null  int64  
 1

In [8]:
# Print the distinct values of every column to eyeball the encoding.
for col, values in dataset.items():
    print(col, "\n", values.unique(), '\n')

loan_amnt 
 [12000.  4800. 27050. ... 39225. 35675. 38350.] 

funded_amnt 
 [12000.  4800. 27050. ... 39225. 35675. 38350.] 

funded_amnt_inv 
 [12000.  4800. 27050. ... 38925. 39225. 35675.] 

term 
 [0 1] 

int_rate 
 [10.99  7.62 12.85  6.62  8.9  16.24 11.99 19.97 14.98 13.53 13.98 22.9
 14.47  9.67 17.57 21.48 18.25 20.5  19.22  6.03 15.61  7.9  22.4  23.7
 23.4  16.99 25.89 25.8  24.08 24.99 25.99 25.57 26.06 24.5  25.83  6.
 17.76 21.7  16.2  13.67 15.1  18.55 19.2  14.3  17.1  19.52 20.2  12.99
 21.   23.1   9.99  6.97  8.6   9.25 15.88 13.68  9.71 11.55 22.7  14.33
 16.78 17.56 15.22 21.6  22.2  18.85 13.05 10.64 12.35 21.15 23.5  24.89
 20.8  20.31 25.28 11.14 12.12 19.72 23.76 15.8  13.11 14.09 15.31 18.75
 19.05 10.16 17.27 16.29 17.77 23.28 18.49 20.49 21.98 24.7  23.83 23.63
 22.95 22.47 21.49 24.83  7.97  9.44 13.59 16.02 23.88 17.09 28.72 30.84
 24.85 21.45 19.03 12.62  7.07 14.08 10.91 15.05 20.    5.32 18.06  9.93
  7.35 30.65  7.21 10.42 30.89 29.69 30.17 26.3  25.82