In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from scipy.stats import mode
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [2]:
# Location of the raw loan CSV. NOTE(review): this is an absolute, machine-specific
# path; point it at your own copy of the file before running the notebook.
DATA_PATH = '/Users/philip_cho/Desktop/df1_loan 2.csv'

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,$5849.0
1,1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,$6091.0
2,2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,$3000.0
3,3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,$4941.0
4,4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,$6000.0


In [3]:
# Split inputs from the target and drop columns that are not used as features.
X = data.drop(['Unnamed: 0', 'Loan_ID', 'Loan_Status'], axis=1)
y = data['Loan_Status']

# Total_Income is read as text with a leading '$' (e.g. '$5849.0'): strip the
# symbol for the whole column at once, then cast it to float. A per-row
# `X['Total_Income'][i] = ...` is chained assignment, which pandas does not
# guarantee to write back into X (and never does under Copy-on-Write).
X['Total_Income'] = X['Total_Income'].str.strip('$')

# Credit_History is cast to object so that the helpers below, which treat every
# non-numeric column as categorical, handle it as a category.
X = X.astype({'Total_Income': 'float', 'Credit_History': 'object'})
    

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             491 non-null    object 
 1   Married            497 non-null    object 
 2   Dependents         488 non-null    object 
 3   Education          500 non-null    object 
 4   Self_Employed      473 non-null    object 
 5   ApplicantIncome    500 non-null    int64  
 6   CoapplicantIncome  500 non-null    float64
 7   LoanAmount         482 non-null    float64
 8   Loan_Amount_Term   486 non-null    float64
 9   Credit_History     459 non-null    object 
 10  Property_Area      500 non-null    object 
 11  Total_Income       500 non-null    float64
dtypes: float64(4), int64(1), object(7)
memory usage: 47.0+ KB


In [5]:
# Missing-value strategy 1: mean for numeric columns, mode for categorical columns
def meanMode(X):
    """Fill missing values column by column and return the frame.

    Numeric columns are filled with their mean rounded to the nearest integer;
    every other column is filled with its most frequent value. Columns that
    contain no observed value at all are left untouched.

    The frame passed in is updated and the same object is returned, so pass
    ``X.copy()`` if the original must be preserved.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        ``X`` itself, with missing values filled.
    """
    # Numeric feature names (public API instead of the private _get_numeric_data).
    num_col = list(X.select_dtypes(include='number').columns)

    # Everything that is not numeric is treated as categorical.
    cat_col = [col for col in X.columns if col not in num_col]

    # Numeric features: fill with the rounded mean.
    for col in num_col:
        col_mean = X[col].mean()
        # An all-missing column has no mean; round(nan) would raise.
        if pd.notna(col_mean):
            # Assign back rather than fillna(inplace=True) on a column selection,
            # which is chained assignment and may not update X.
            X[col] = X[col].fillna(round(col_mean))

    # Categorical features: fill with the most frequent value.
    for col in cat_col:
        # Series.mode ignores missing values and works on non-numeric data;
        # on ties the first (smallest) mode is used.
        modes = X[col].mode()
        if not modes.empty:
            # Cast back to the column's dtype so that fillna cannot turn an
            # object column into a numeric one.
            X[col] = X[col].fillna(modes.iloc[0]).astype(X[col].dtype)

    return X
        
    
# Missing-value strategy 2: drop incomplete rows
def delNaN(X):
    """Return a copy of ``X`` without the rows that contain any missing value.

    The result is renumbered from 0. The old row labels are discarded
    (``drop=True``) so they are not added as an extra ``index`` column that
    would later be treated as a feature. The input frame is not modified.
    """
    X = X.dropna(how='any', axis=0).reset_index(drop=True)
    return X

In [6]:
# Encoding 1: label encoding
def LabelEncode(X):
    """Replace every non-numeric column of ``X`` with LabelEncoder codes.

    Numeric columns are left as they are. The frame passed in is updated and
    the same object is returned.
    """
    # Names of the columns pandas reports as numeric.
    numeric_names = set(X._get_numeric_data().columns)

    # All remaining columns are treated as categorical.
    categorical_names = [name for name in X.columns if name not in numeric_names]

    # One encoder instance is reused; it is refitted on each column in turn.
    label_encoder = LabelEncoder()
    for name in categorical_names:
        X[name] = label_encoder.fit_transform(X[name])

    return X

# Encoding 2: one-hot encoding
def OneHotEncode(X):
    """Return a new frame in which every object-dtype column is one-hot encoded.

    Non-object columns are kept unchanged and come first; the indicator
    columns produced by ``OneHotEncoder`` are appended after them. The input
    frame is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; columns with dtype ``object`` are treated as categorical.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the one-hot columns, on the
        same row index as ``X``.
    """
    object_col = [col for col in X.columns if X[col].dtype == 'object']

    # Nothing to encode: hand back an unchanged copy instead of fitting the
    # encoder on zero columns.
    if not object_col:
        return X.copy()

    enc = OneHotEncoder()
    enc.fit(X.loc[:, object_col])
    encoded = enc.transform(X.loc[:, object_col]).toarray()

    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases; fall back only when the new method is absent.
    if hasattr(enc, 'get_feature_names_out'):
        feature_names = enc.get_feature_names_out(object_col)
    else:
        feature_names = enc.get_feature_names(object_col)

    # Reuse X's index so concat lines the rows up even when X does not carry
    # a default 0..n-1 index (e.g. after rows were filtered out).
    onehot_X = pd.DataFrame(encoded, columns=feature_names, index=X.index)

    # drop() returns a new frame, so the caller's X keeps its original columns.
    X = pd.concat([X.drop(object_col, axis=1), onehot_X], axis=1)

    return X


### [결측치 메소드]
- meanMode(dataFrame): 범주(최빈값), 수치(평균값)
- delNaN(dataFrame): NaN행 삭제
 
 
### [인코딩 메소드]
- LabelEncode(dataFrame)
- OneHotEncode(dataFrame)

 
※ 결측치 제거 후 인코딩 진행할 것