In [1]:
import pandas as pd
import pickle
import os

In [2]:
# Switch the working directory so the relative data paths used by later cells
# ('./data/...' in transform, 'data/categorical/...' below) resolve.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere until it is replaced by a configurable project directory.
%cd "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/tablegan" 

C:\Users\GCU\Lending_club\Data_Analysis_lending-club\tablegan


In [3]:
def cat_to_num(df, sep=',', cat_names=None):
    """Integer-encode the categorical columns of ``df`` using ``pd.factorize``.

    The columns that get encoded are every ``object``-dtype column plus every
    column named in ``cat_names`` (whatever its dtype). Each column is encoded
    exactly once, in a deterministic order: object columns in frame order
    first, then any additional ``cat_names`` in the order given.

    Args:
        df: Source frame. It is not modified; a copy is encoded and returned.
        sep: Unused. Kept only so existing callers that pass it keep working.
        cat_names: Optional iterable of extra column names to encode.

    Returns:
        A ``(df_num, subs)`` tuple. ``df_num`` is a copy of ``df`` in which the
        selected columns hold integer codes. ``subs`` maps each encoded column
        name to ``{'y': codes, 'label': uniques}`` exactly as returned by
        ``pd.factorize``, so the encoding can be reversed later.

    Raises:
        KeyError: If a name in ``cat_names`` is not a column of ``df``.
    """
    extra_cols = [] if cat_names is None else list(cat_names)
    df_num = df.copy()

    object_cols = df_num.select_dtypes(include=['object']).columns.tolist()
    # dict.fromkeys drops duplicates (a column may be both object-typed and
    # listed in cat_names) while preserving order. A plain set would also
    # prevent double factorization, but its iteration order is not stable
    # across runs, which made the key order of `subs` non-reproducible.
    columns_to_encode = list(dict.fromkeys(object_cols + extra_cols))

    subs = {}
    for column in columns_to_encode:
        # Factorize from the untouched source frame, not the copy being edited.
        codes, labels = pd.factorize(df[column])
        subs[column] = {'y': codes, 'label': labels}
        df_num[column] = codes
    return df_num, subs

In [4]:
def transform(ds: str, suffix: str = '', sep: str = ',', ext: str = 'csv',
              drop: list = None, cat_names: list = None, to_disk: bool = True,
              target: str = '', d_basepath: str = 'data'):
    
    if target == '':
        target = False
    if drop is None:
        drop = []
    if cat_names is None:
        cat_names = []

    base_fname = f'./{d_basepath}/{ds}/{ds}'
    source_fname = f'{base_fname}{suffix}.{ext}'
    print(f'Basepath: {base_fname}')
    print(f'Source file: {source_fname}')

    # ✅ CSV 파일 읽기 및 오류 처리
    try:
        df = pd.read_csv(source_fname, sep=sep, quotechar='"', error_bad_lines=False)
    except pd.errors.ParserError as e:
        print(f"❌ CSV 파싱 오류: {e}")
        return None, None
    except Exception as e:
        print(f"❌ CSV 파일 로드 중 오류 발생: {e}")
        return None, None

    # ✅ 문자열 내 쉼표 및 리스트 형태 처리
    def convert_to_number(value):
        if isinstance(value, str):
            value = value.strip().replace('"', '')
            try:
                # 문자열 리스트 처리: "[1.0, 2.0]" → [1.0, 2.0]
                if value.startswith('[') and value.endswith(']'):
                    return ast.literal_eval(value)
                return float(value)  # 일반 숫자 문자열 처리
            except Exception:
                return value  # 변환 실패 시 원래 값 반환
        return value

    df = df.applymap(convert_to_number)

    # ✅ 열 삭제
    df = df.drop(drop, axis=1, errors='ignore')

    # ✅ 범주형 변수 숫자 변환
    try:
        df_num, subs = cat_to_num(df, cat_names=cat_names)
    except Exception as e:
        print(f"❌ cat_to_num 함수 오류: {e}")
        return None, None

    # ✅ 변환 정보 저장
    try:
        with open(f'{d_basepath}/{ds}/subs.pkl', 'wb') as f:
            pickle.dump(subs, f)
    except Exception as e:
        print(f"❌ subs.pkl 저장 오류: {e}")

    # ✅ 타겟 분리 및 저장
    if target:
        y = df_num[target]
        df_num = df_num.drop([target], axis=1)

    if to_disk:
        if target:
            target_fname_y = f'{base_fname}_labels.csv'
            print(f'✅ 타겟 레이블 파일 저장: {target_fname_y}')
            y.to_csv(target_fname_y, sep=',', index=False)
        
        target_fname = f'{base_fname}_encoding.csv'
        print(f'✅ 변환된 데이터 저장: {target_fname}')
        df_num.to_csv(target_fname, sep=',', index=False)

    if target:
        return df_num, y, subs
    return df_num, subs


In [5]:
# Columns to integer-encode even if pandas does not load them as object dtype.
cat_names = ['loan_status']

In [6]:
# Encode the 'categorical' dataset with 'loan_status' split off as the target.
# With a target, transform returns (encoded features, labels, encoding tables),
# bound here to a, b and c respectively.
a, b, c = transform('categorical', suffix='', to_disk=True, target='loan_status',cat_names=cat_names)
# Preview the first rows of the encoded feature frame.
a.head()

Basepath: ./data/categorical/categorical
Source file: ./data/categorical/categorical.csv




  """Entry point for launching an IPython kernel.


✅ 타겟 레이블 파일 저장: ./data/categorical/categorical_labels.csv
✅ 변환된 데이터 저장: ./data/categorical/categorical_encoding.csv


Unnamed: 0,last_fico_range_high,annual_inc,dti,mo_sin_old_rev_tl_op,revol_util,int_rate,installment,avg_cur_bal,revol_bal,debt_settlement_flag,sub_grade,home_ownership,purpose,grade,total_pymnt,total_pymnt_inv,funded_amnt,loan_amnt,credit_history_years,term_months
0,604.0,67500.0,15.81,206.0,0.396,0.2449,492.45,6661.0,21088.0,0,0,0,0,0,12284.92,12284.92,16950.0,16950.0,17,60
1,559.0,46000.0,13.75,315.0,0.471,0.2049,636.04,1684.0,15158.0,0,1,0,1,0,19664.06,19664.06,17000.0,17000.0,26,36
2,499.0,85000.0,19.41,161.0,0.447,0.1824,306.3,6208.0,10057.0,0,2,1,0,1,3020.87,3020.87,12000.0,12000.0,13,60
3,664.0,60000.0,25.91,103.0,0.366,0.1825,543.15,10628.0,20224.0,0,3,1,1,1,3258.9,3258.9,21275.0,21275.0,9,60
4,649.0,195000.0,10.93,189.0,0.43,0.1561,482.23,24979.0,20745.0,0,4,0,1,2,7365.55,7365.55,20000.0,20000.0,15,60


In [13]:
# Reload the raw (un-encoded) source CSV and show its row count.
dataset = pd.read_csv("data/categorical/categorical.csv")
display(len(dataset))

217789

In [14]:
# Print every column name followed by the distinct values it contains.
# NOTE(review): for high-cardinality columns this produces long output;
# consider dataset.nunique() or a truncated sample instead.
for col in dataset.columns:
    print(col, "\n", dataset[col].unique(),'\n')

last_fico_range_high 
 [604. 559. 499. 664. 649. 544. 549. 529. 569. 594. 579. 599. 589. 709.
 539. 619. 629. 564. 534. 654. 674. 519. 524. 614. 644. 609. 514. 634.
 554. 639. 699. 574. 509. 659. 584. 504. 684. 694. 624. 669. 714. 679.
 719. 689. 724. 704. 764. 784. 779. 734. 749. 729. 789. 774. 794. 809.
 829. 739. 754. 819. 814. 834. 799. 744. 759. 804. 850.   0. 769. 824.
 844. 839.] 

annual_inc 
 [67500. 46000. 85000. ... 54078. 58596. 75388.] 

dti 
 [ 15.81  13.75  19.41 ...  56.6  131.58  54.48] 

mo_sin_old_rev_tl_op 
 [206.         315.         161.         103.         189.
 186.         159.         167.         268.         102.
 239.          50.         221.         519.         220.
  66.         131.         190.          77.         129.
 357.         125.         136.         188.         423.
 270.         138.         191.          80.         297.
 119.         113.         108.         107.         201.
 178.         216.         299.          15.          64.
 3