In [2]:
import pandas as pd
import numpy as np
from scipy.stats import mode
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [4]:
# Load the loan data set from CSV and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
data = pd.read_csv(
    filepath_or_buffer='/Users/philip_cho/Desktop/df1_loan 2.csv',
)
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,$5849.0
1,1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,$6091.0
2,2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,$3000.0
3,3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,$4941.0
4,4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,$6000.0


In [5]:
# Split the frame into the target (Loan_Status) and the feature columns.
y = data['Loan_Status']
X = data.drop(columns='Loan_Status')

# EDA Baseline
### Key point: null 값을 무엇으로 채울 것인지 고민 필요

 - Dependents: 부양가족 수
 - Credit_History: 신용 이력 (정확한 정의 확인 필요)

 - 원핫 인코딩을 할지, 레이블 인코딩을 할지 고민 필요

In [6]:
# Print column names, non-null counts and dtypes to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         500 non-null    int64  
 1   Loan_ID            500 non-null    object 
 2   Gender             491 non-null    object 
 3   Married            497 non-null    object 
 4   Dependents         488 non-null    object 
 5   Education          500 non-null    object 
 6   Self_Employed      473 non-null    object 
 7   ApplicantIncome    500 non-null    int64  
 8   CoapplicantIncome  500 non-null    float64
 9   LoanAmount         482 non-null    float64
 10  Loan_Amount_Term   486 non-null    float64
 11  Credit_History     459 non-null    float64
 12  Property_Area      500 non-null    object 
 13  Loan_Status        500 non-null    object 
 14  Total_Income       500 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 58.7+ KB


In [5]:
# Preprocessing: build the model-ready feature matrix.
#
# X is rebuilt from `data` on every run (dropping the target plus the two
# identifier-like columns) so this cell can be re-executed safely instead of
# failing on an X that has already been modified.
X = data.drop(columns=['Loan_Status', 'Unnamed: 0', 'Loan_ID'])

# Total_Income is stored as text with a '$' marker (e.g. '$5849.0').
# Convert the whole column at once; the previous element-wise chained
# assignment (X['Total_Income'][i] = ...) left the column as object dtype and
# is not guaranteed to write back into X under pandas Copy-on-Write.
X['Total_Income'] = X['Total_Income'].str.strip('$').astype(float)

# Continuous features: every numeric column except Credit_History, which is
# treated as a categorical flag below.
num_col = [
    col for col in X.select_dtypes(include='number').columns
    if col != 'Credit_History'
]

# Categorical features: everything that is not continuous.
cat_col = [col for col in X.columns if col not in num_col]

# NOTE(review): the fill values below are computed on the full data set,
# i.e. before the train/test split, so test rows influence the imputation.

# Continuous features: fill missing values with the rounded column mean.
# Assigning the result back avoids chained `inplace=True`, which may operate
# on a temporary copy.
for col in num_col:
    X[col] = X[col].fillna(round(X[col].mean()))

# Categorical features: fill missing values with the most frequent value.
# Series.mode() ignores NaN and works on string columns.
for col in cat_col:
    X[col] = X[col].fillna(X[col].mode().iloc[0])

# Label-encode the categorical features into integer codes.
# The single encoder is refit per column, so after the loop it only holds the
# classes of the last column.
encoder = LabelEncoder()
for cat in cat_col:
    X[cat] = encoder.fit_transform(X[cat])

# TODO: the continuous features have very different distributions, so a
# scaling strategy still has to be chosen. Candidate: log scale, using log1p
# so that zero values do not become -inf.
# for num in num_col:
#     X[num] = np.log1p(X[num])

In [6]:
# Check the preprocessed features: non-null counts and dtypes per column.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             500 non-null    int64  
 1   Married            500 non-null    int64  
 2   Dependents         500 non-null    int64  
 3   Education          500 non-null    int64  
 4   Self_Employed      500 non-null    int64  
 5   ApplicantIncome    500 non-null    int64  
 6   CoapplicantIncome  500 non-null    float64
 7   LoanAmount         500 non-null    float64
 8   Loan_Amount_Term   500 non-null    float64
 9   Credit_History     500 non-null    int64  
 10  Property_Area      500 non-null    int64  
 11  Total_Income       500 non-null    float64
dtypes: float64(4), int64(8)
memory usage: 47.0 KB


# Modeling

#### [사용할 만한 모델]
 - Logistic Regression
 - Decision Tree
 - XGBoost
 - SVM

In [7]:
# Hold out 25% of the rows for evaluation.
# A fixed random_state makes the split, and therefore the reported score,
# reproducible across runs; stratify=y keeps the Loan_Status class ratio the
# same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [8]:
# Fit a logistic-regression baseline and predict labels for the held-out rows.
# max_iter is raised above the default of 100 because the features are
# unscaled, which can stop the solver before it converges; with warnings
# globally silenced earlier in the notebook, that would go unnoticed.
# NOTE(review): confirm convergence once a scaling strategy is chosen.
logistic = LogisticRegression(max_iter=1000)
logistic.fit(X_train, y_train)
y_preds = logistic.predict(X_test)

# Evaluating

In [9]:
# Report test-set accuracy of the predictions, rounded to three decimals.
test_accuracy = accuracy_score(y_test, y_preds)
print(f'Accuracy Score: {test_accuracy:0.3f}')

Accuracy Score: 0.800


# Visualize