In [1]:
import ast
import os
import pickle

import pandas as pd

In [2]:
# Switch the kernel's working directory to the project folder so that the
# relative `data/...` paths used by later cells resolve.
# NOTE(review): this absolute path is specific to one Windows machine; adjust
# it before running the notebook anywhere else.
%cd "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/tablegan" 

C:\Users\GCU\Lending_club\Data_Analysis_lending-club\tablegan


In [3]:
def cat_to_num(df, sep=',', cat_names=None):
    """Integer-encode the categorical columns of ``df``.

    Every column with ``object`` dtype, plus every column named in
    ``cat_names``, is replaced by the integer codes produced by
    ``pd.factorize``.

    Args:
        df: Source frame. It is not modified; a copy is encoded and returned.
        sep: Unused. Kept so that existing callers passing it keep working.
        cat_names: Optional iterable of extra column names to encode
            regardless of their dtype.

    Returns:
        A ``(df_num, subs)`` tuple. ``df_num`` is a copy of ``df`` with the
        selected columns replaced by their codes. ``subs`` maps each encoded
        column name to ``{'y': codes, 'label': uniques}`` as returned by
        ``pd.factorize``, in a deterministic order (object columns in frame
        order first, then the remaining ``cat_names``).

    Raises:
        KeyError: If a name in ``cat_names`` is not a column of ``df``.
    """
    if cat_names is None:
        cat_names = []
    subs = {}
    df_num = df.copy()

    object_cols = df_num.select_dtypes(include=['object']).columns.tolist()

    # A column may be both object-typed and listed in cat_names. dict.fromkeys
    # removes such duplicates (preventing double factorization) while keeping
    # a stable order, which iterating over a set does not guarantee.
    for col in dict.fromkeys(object_cols + list(cat_names)):
        codes, labels = pd.factorize(df[col])
        subs[col] = {'y': codes, 'label': labels}
        df_num[col] = codes
    return df_num, subs

In [4]:
def transform(ds: str, suffix: str = '', sep: str = ',', ext: str = 'csv',
              drop: list = None, cat_names: list = None, to_disk: bool = True,
              target: str = '', d_basepath: str = 'data'):
    """Load a dataset CSV, integer-encode its categorical columns, and save it.

    The source file is ``./{d_basepath}/{ds}/{ds}{suffix}.{ext}``. String
    cells are converted to floats (or to Python lists for ``"[...]"``
    literals) where possible, the columns in ``drop`` are removed, and the
    remaining categorical columns are encoded with ``cat_to_num``.

    Args:
        ds: Dataset name; used both as the sub-directory and the file stem.
        suffix: Optional text appended to the file stem of the source file.
        sep: Field separator of the source CSV.
        ext: Extension of the source file.
        drop: Column names to remove before encoding; missing names are
            ignored.
        cat_names: Extra column names to encode regardless of dtype.
        to_disk: When true, write ``{ds}_encoding.csv`` (and
            ``{ds}_labels.csv`` if ``target`` is set) next to the source.
            Note that ``subs.pkl`` is written even when this is false.
        target: Name of the label column to split off; ``''`` means none.
        d_basepath: Base data directory, relative to the working directory.

    Returns:
        ``(df_num, y, subs)`` when ``target`` is set, otherwise
        ``(df_num, subs)``. If loading or encoding fails, a tuple of the same
        length filled with ``None`` is returned so that callers can unpack
        the result unconditionally.

    Raises:
        KeyError: If ``target`` is set but is not a column of the encoded
            frame.
    """
    if target == '':
        target = False
    if drop is None:
        drop = []
    if cat_names is None:
        cat_names = []

    # Failure value with the same arity as the success value for this call.
    failure = (None, None, None) if target else (None, None)

    base_fname = f'./{d_basepath}/{ds}/{ds}'
    source_fname = f'{base_fname}{suffix}.{ext}'
    print(f'Basepath: {base_fname}')
    print(f'Source file: {source_fname}')

    # Read the CSV, skipping malformed rows.
    try:
        try:
            # `on_bad_lines` replaces the deprecated `error_bad_lines`.
            df = pd.read_csv(source_fname, sep=sep, quotechar='"', on_bad_lines='warn')
        except TypeError:
            # Older pandas does not know `on_bad_lines`.
            df = pd.read_csv(source_fname, sep=sep, quotechar='"', error_bad_lines=False)
    except pd.errors.ParserError as e:
        print(f"❌ CSV 파싱 오류: {e}")
        return failure
    except Exception as e:
        print(f"❌ CSV 파일 로드 중 오류 발생: {e}")
        return failure

    # Convert numeric strings and string-encoded lists to Python values.
    def convert_to_number(value):
        if isinstance(value, str):
            value = value.strip().replace('"', '')
            try:
                # String-encoded list: "[1.0, 2.0]" -> [1.0, 2.0]
                if value.startswith('[') and value.endswith(']'):
                    return ast.literal_eval(value)
                return float(value)  # plain numeric string
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return value  # not convertible: keep the cleaned string
        return value

    # DataFrame.applymap is deprecated in favour of DataFrame.map; look the
    # method up on the class so a column named "map" cannot shadow it.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    df = elementwise(df, convert_to_number)

    # Drop unwanted columns.
    df = df.drop(drop, axis=1, errors='ignore')

    # Encode categorical columns as integers.
    try:
        df_num, subs = cat_to_num(df, cat_names=cat_names)
    except Exception as e:
        print(f"❌ cat_to_num 함수 오류: {e}")
        return failure

    # Persist the encoding information (best effort: a failure is reported
    # but does not abort the transform).
    try:
        with open(f'{d_basepath}/{ds}/subs.pkl', 'wb') as f:
            pickle.dump(subs, f)
    except Exception as e:
        print(f"❌ subs.pkl 저장 오류: {e}")

    # Split off the target column.
    if target:
        y = df_num[target]
        df_num = df_num.drop([target], axis=1)

    if to_disk:
        if target:
            target_fname_y = f'{base_fname}_labels.csv'
            print(f'✅ 타겟 레이블 파일 저장: {target_fname_y}')
            y.to_csv(target_fname_y, sep=',', index=False)

        target_fname = f'{base_fname}_encoding.csv'
        print(f'✅ 변환된 데이터 저장: {target_fname}')
        df_num.to_csv(target_fname, sep=',', index=False)

    if target:
        return df_num, y, subs
    return df_num, subs


In [5]:
# Columns to integer-encode regardless of their dtype.
cat_names = ["loan_status"]

In [6]:
# Encode the training set, splitting off `loan_status` as the label column:
# a = encoded features, b = encoded labels, c = per-column encoding info.
a, b, c = transform(
    'train',
    suffix='',
    to_disk=True,
    target='loan_status',
    cat_names=cat_names,
)
a.head()

Basepath: ./data/train/train
Source file: ./data/train/train.csv




  """Entry point for launching an IPython kernel.


✅ 타겟 레이블 파일 저장: ./data/train/train_labels.csv
✅ 변환된 데이터 저장: ./data/train/train_encoding.csv


Unnamed: 0,last_fico_range_high,int_rate,mo_sin_old_rev_tl_op,dti,annual_inc,zip_code,installment,fico_range_low,revol_util,total_acc,total_bc_limit,revol_bal,total_rev_hi_lim,bc_open_to_buy,credit_history_years
0,539.0,0.0532,155.0,9.74,96000.0,67.0,361.38,795.0,0.119,29.0,90200.0,11201.0,94200.0,78999.0,22
1,674.0,0.1999,177.0,38.16,55000.0,657.0,408.75,660.0,0.311,27.0,28300.0,16198.0,52000.0,13998.0,15
2,529.0,0.1824,26.0,32.2,18000.0,207.0,181.37,665.0,0.961,34.0,500.0,2210.0,2300.0,30.0,11
3,559.0,0.1699,187.0,12.26,75000.0,105.0,1176.38,695.0,0.038,18.0,0.0,580.0,15400.0,11065.920936,15
4,639.0,0.1399,288.0,15.05,92000.0,115.0,409.43,665.0,0.901,40.0,13000.0,18021.0,20000.0,486.0,24


In [7]:
# Reload the raw (un-encoded) training CSV and show how many rows it has.
dataset = pd.read_csv("data/train/train.csv")
display(dataset.shape[0])

290385

In [8]:
# List the distinct values found in each column of the raw training data.
for column_name, column_values in dataset.items():
    print(column_name, "\n", column_values.unique(), '\n')

last_fico_range_high 
 [699. 644. 634. 684. 649. 734. 759. 769. 619. 704. 729. 544. 539. 674.
 664. 739. 529. 689. 744. 714. 749. 679. 574. 724. 669. 609. 784. 659.
 779. 559. 719. 639. 629. 589. 499. 569. 819. 774. 814. 654. 514. 709.
 624. 694. 789. 799. 824. 764. 549. 604. 754. 839. 834. 599. 594. 524.
 584. 504. 614. 554. 534. 794. 509. 809. 564. 804. 579. 519. 829. 844.
   0. 850.] 

int_rate 
 [0.1099 0.1786 0.1033 0.1359 0.0818 0.162  0.1797 0.0646 0.0917 0.1449
 0.0624 0.1602 0.079  0.0532 0.1999 0.2025 0.1149 0.1399 0.0762 0.1824
 0.1114 0.235  0.1757 0.0789 0.139  0.1825 0.1875 0.1274 0.1212 0.1431
 0.1408 0.1229 0.1259 0.1699 0.1311 0.21   0.1614 0.0699 0.1041 0.1531
 0.1398 0.1269 0.0735 0.1899 0.1624 0.1239 0.0963 0.1952 0.0799 0.1499
 0.2245 0.1199 0.1502 0.1335 0.1016 0.2199 0.1599 0.1559 0.1056 0.1091
 0.0797 0.0824 0.1629 0.1147 0.2185 0.1262 0.0739 0.1464 0.1139 0.0668
 0.0689 0.0992 0.1756 0.0859 0.1774 0.1561 0.1349 0.0531 0.2099 0.0899
 0.1849 0.2145 0.0819 0.1695 