# XGBoost 모델링
1. 대용량 데이터에는 트랜스포머가 좋은 성능을 보이지만, 중소형 데이터 셋인 경우 ML 기반의 모델이 높은 성능을 보이기도 하기 때문에, 트랜스포머와 병행하여 수행
2. 트랜스포머, loan_amt_pred와 동일한 데이터셋 사용

## Data Import

In [1]:
import pandas as pd
Data_EDA = pd.read_csv('../../Data/home-credit-default-risk/application_train.csv',index_col='SK_ID_CURR')

## Data columns 분류

In [2]:
import pandas as pd
import numpy as np
import pandas.api.types as ptypes

threshold = 10

cat_cols_by_cardinality = []
num_cols_by_cardinality = []

for col in Data_EDA.columns:

    # 고유값 개수 확인
    unique_count = Data_EDA[col].nunique()
    
    if 'flag' in col.lower():
        cat_cols_by_cardinality.append(col)
    elif 'amt' in col.lower():
        num_cols_by_cardinality.append(col)

    # 실수형/정수형이면 일단 수치형으로 분류하되, 
    # 만약 유니크 값이 작은 범주 느낌이라면 cat_cols_by_cardinality 로 옮길 수도 있음
    elif ptypes.is_numeric_dtype(Data_EDA[col]):
        if unique_count < threshold:
            cat_cols_by_cardinality.append(col)
        else:
            num_cols_by_cardinality.append(col)
    else:
        # 문자인 경우 범주형으로 분류
        cat_cols_by_cardinality.append(col)

print("범주형(유니크값 < 10):", cat_cols_by_cardinality)
print("수치형(유니크값 >= 10):", num_cols_by_cardinality)


범주형(유니크값 < 10): ['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16

## 결측치도 의미가 있을 수 있으므로, 보간하지 않고, MISSING으로 처리

In [None]:
for c in cat_cols_by_cardinality:
    Data_EDA[c] = Data_EDA[c].fillna("MISSING")

## 범주형 정보 레이블 인코딩

In [3]:
# 범주형 컬럼 List에서 Target 컬럼을 제외함
cat_cols_by_cardinality_less_target = [x for x in cat_cols_by_cardinality if x != 'TARGET']

print(cat_cols_by_cardinality_less_target) 

['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FL

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# LabelEncoder를 보관할 딕셔너리 (각 컬럼별로 학습된 encoder를 저장)
Data_EDA_labeled = Data_EDA.copy()
encoders = {}
# 각 범주형 컬럼에 대해 LabelEncoder로 변환
for col in cat_cols_by_cardinality_less_target:
    le = LabelEncoder()
    Data_EDA_labeled[col] = le.fit_transform(Data_EDA[col])
    encoders[col] = le

## 수치형 데이터 스케일링

### KNN Imputer 
- 이 노트북으로는 실행이 안됨

In [None]:
from sklearn.impute import KNNImputer  # KNNImputer

num_data = Data_EDA[num_cols_by_cardinality].values  # (300000, 45)

# KNNImputer 생성 (예: n_neighbors=5)
knn_imputer = KNNImputer(n_neighbors=5)
num_data_imputed = knn_imputer.fit_transform(num_data)  # 결측치 보간

# Imputed 결과를 df에 반영
Data_EDA_labeled[num_cols_by_cardinality] = num_data_imputed


### simple imputer 이후 score 측정

In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

# 예시: 평균 대체
imputer = SimpleImputer(strategy="median") # "mean", "median", "most_frequent", "constant" 옵션 존재

# fit_transform으로 결측치 보간
num_data = Data_EDA[num_cols_by_cardinality].values  # (300000, 45)

num_data_imputed = imputer.fit_transform(num_data)
Data_EDA_labeled[num_cols_by_cardinality] = num_data_imputed
print("Before:")
print(num_data)
print("After:")
print(num_data_imputed)


In [None]:
import pandas as pd
from sklearn.preprocessing import RobustScaler


# 로버스트 스케일러 초기화 (중앙값과 IQR을 사용)
scaler = RobustScaler()

# 대출 잔액 컬럼에 로버스트 스케일링 적용
Data_EDA_scaled = Data_EDA_labeled.copy()
Data_EDA_scaled[num_cols_by_cardinality] = scaler.fit_transform(Data_EDA_labeled[num_cols_by_cardinality])

print("스케일링 전 데이터:")
print(Data_EDA_labeled[num_cols_by_cardinality])
print("\n스케일링 후 데이터:")
print(Data_EDA_scaled[num_cols_by_cardinality])


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer  # 간단히 평균/중앙값 처리 예시
from sklearn.metrics import accuracy_score

# XGBoost
import xgboost as xgb

# 데이터 분리
X = Data_EDA_scaled.drop(columns=['TARGET'])
y = Data_EDA_scaled['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)

In [None]:
model_s = xgb.XGBClassifier(
    n_estimators=1500,     
    max_depth=8,
    learning_rate=0.05,   
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=99,  
    random_state=42,
    n_jobs=-1
)

In [None]:
model_s.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score

y_pred = model.predict(X_train)
acc = accuracy_score(y_train, y_pred)
print("XGBoost Test Accuracy:", acc)

# 모델 예측 확률 출력
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))

# 원하는 임계값(예: 0.3)으로 양성/음성 결정
threshold = 0.7
y_pred_custom = (y_pred_proba >= threshold).astype(int)

# 이후 y_pred_custom으로 정밀도, 재현율, f1_score 측정
print("f1 score:", f1_score(y_test, y_pred_custom))

### MICE imputer 이후 score 측정

In [5]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

num_data = Data_EDA[num_cols_by_cardinality].values

imp = IterativeImputer(max_iter=10, random_state=42) #하이퍼 파라미터에 대하여 추가적으로 시험해볼 것
num_data_mice = imp.fit_transform(num_data)

Data_mice = Data_EDA_labeled.copy()
Data_mice[num_cols_by_cardinality] = num_data_mice

scaler = RobustScaler()

Data_EDA_scaled_mice = Data_mice.copy()
Data_EDA_scaled_mice[num_cols_by_cardinality] = scaler.fit_transform(Data_mice[num_cols_by_cardinality])


In [36]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer  # 간단히 평균/중앙값 처리 예시
from sklearn.metrics import accuracy_score

# XGBoost
import xgboost as xgb

# 데이터 분리
X_m = Data_EDA_scaled_mice.drop(columns=['TARGET'])
y_m = Data_EDA_scaled_mice['TARGET']
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, test_size=0.2, random_state=42)
print(X_train_m.shape)

model_m = xgb.XGBClassifier(
    n_estimators=2500,
    max_depth=6,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

(246008, 120)


In [37]:
model_m.fit(X_train_m, y_train_m)

In [38]:
from sklearn.metrics import roc_auc_score, f1_score

y_pred_m = model_m.predict(X_train_m)
acc = accuracy_score(y_train_m, y_pred_m)
print("XGBoost Test Accuracy:", acc)

# 모델 예측 확률 출력
y_pred_m_proba = model_m.predict_proba(X_test_m)[:, 1]
print("ROC AUC:", roc_auc_score(y_test_m, y_pred_m_proba))

XGBoost Test Accuracy: 0.922811453286072
ROC AUC: 0.7616877620484468


In [39]:
for i in range(1,100):
    r = i/100
    threshold = r
    y_pred_m_custom = (y_pred_m_proba >=threshold).astype(int)
    print("f1 score: ", f1_score(y_test_m, y_pred_m_custom))

f1 score:  0.152804193533423
f1 score:  0.1687693032507983
f1 score:  0.1890640615121046
f1 score:  0.20896019805751284
f1 score:  0.22672487164703622
f1 score:  0.24196727466132717
f1 score:  0.2558320373250389
f1 score:  0.2692964006508195
f1 score:  0.2803886925795053
f1 score:  0.28867483946293054
f1 score:  0.2968400893712097
f1 score:  0.3008624182439081
f1 score:  0.3032699428858344
f1 score:  0.3069098799755883
f1 score:  0.31079495453212086
f1 score:  0.3074980268350434
f1 score:  0.3010770784247728
f1 score:  0.29668661248548717
f1 score:  0.29357798165137616
f1 score:  0.2887359074129502
f1 score:  0.2835522858644246
f1 score:  0.27547419497132775
f1 score:  0.2695982502590077
f1 score:  0.26252408477842004
f1 score:  0.2506573181419807
f1 score:  0.2438011164481371
f1 score:  0.23635385856413013
f1 score:  0.22628951747088186
f1 score:  0.21531853972798853
f1 score:  0.20512820512820512
f1 score:  0.1958762886597938
f1 score:  0.18593822753375758
f1 score:  0.17529374404572

- 2차 시도 : scale_pos_weight를 주게 되면, AR이 급속히 떨어진다
- 차라리 부도 판정 threshold를 낮추는 것이 효과적

In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer  # 간단히 평균/중앙값 처리 예시
from sklearn.metrics import accuracy_score

# XGBoost
import xgboost as xgb

# 데이터 분리
X_m = Data_EDA_scaled_mice.drop(columns=['TARGET'])
y_m = Data_EDA_scaled_mice['TARGET']
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, test_size=0.2, random_state=42)
print(X_train_m.shape)

model_m = xgb.XGBClassifier(
    n_estimators=1500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=70,  
    random_state=42,
    n_jobs=-1
)

(246008, 120)


In [22]:
model_m.fit(X_train_m, y_train_m)

In [23]:
from sklearn.metrics import roc_auc_score, f1_score

y_pred_m = model_m.predict(X_train_m)
acc = accuracy_score(y_train_m, y_pred_m)
print("XGBoost Test Accuracy:", acc)

# 모델 예측 확률 출력
y_pred_m_proba = model_m.predict_proba(X_test_m)[:, 1]
print("ROC AUC:", roc_auc_score(y_test_m, y_pred_m_proba))

XGBoost Test Accuracy: 0.5675872329355143
ROC AUC: 0.7324219219080916


In [28]:
for i in range(1,100):
    r = i/100
    threshold = r
    y_pred_m_custom = (y_pred_m_proba >=threshold).astype(int)
    print("f1 score: ", f1_score(y_test_m, y_pred_m_custom))

f1 score:  0.1512775262794445
f1 score:  0.15278382581648522
f1 score:  0.15394546256061464
f1 score:  0.1552476508998248
f1 score:  0.15616301707077695
f1 score:  0.1575069531415188
f1 score:  0.15886161227776135
f1 score:  0.1598871462949133
f1 score:  0.16110980799517474
f1 score:  0.1624111073484592
f1 score:  0.1636298311348355
f1 score:  0.16450949558485545
f1 score:  0.16566312372375516
f1 score:  0.1667401230541403
f1 score:  0.1678505937438801
f1 score:  0.16914832398216084
f1 score:  0.1704679691049523
f1 score:  0.1712996091814829
f1 score:  0.17237351921543909
f1 score:  0.17362958522237626
f1 score:  0.17516478521100085
f1 score:  0.17641998202917392
f1 score:  0.17738787656277175
f1 score:  0.17853032401537172
f1 score:  0.17928153388377638
f1 score:  0.18039309302140527
f1 score:  0.18138319260152794
f1 score:  0.1827270879902459
f1 score:  0.18367808064416738
f1 score:  0.1850075771729879
f1 score:  0.18612457255543668
f1 score:  0.18779442346051012
f1 score:  0.1892042

## 하이퍼파라미터 조정
- pos_weight 삭제
- learning rate 낮추기

- 3차 시도

In [40]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer  # 간단히 평균/중앙값 처리 예시
from sklearn.metrics import accuracy_score

# XGBoost
import xgboost as xgb

# 데이터 분리
X_m = Data_EDA_scaled_mice.drop(columns=['TARGET'])
y_m = Data_EDA_scaled_mice['TARGET']
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, test_size=0.2, random_state=42)
print(X_train_m.shape)

model_m = xgb.XGBClassifier(
    n_estimators=3500,
    max_depth=7,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

(246008, 120)


In [41]:
model_m.fit(X_train_m, y_train_m)

In [42]:
from sklearn.metrics import roc_auc_score, f1_score

y_pred_m = model_m.predict(X_train_m)
acc = accuracy_score(y_train_m, y_pred_m)
print("XGBoost Test Accuracy:", acc)

# 모델 예측 확률 출력
y_pred_m_proba = model_m.predict_proba(X_test_m)[:, 1]
print("ROC AUC:", roc_auc_score(y_test_m, y_pred_m_proba))

XGBoost Test Accuracy: 0.9303803128353549
ROC AUC: 0.7599114050631217


In [43]:
for i in range(1,100):
    r = i/100
    threshold = r
    y_pred_m_custom = (y_pred_m_proba >=threshold).astype(int)
    print("f1 score: ", f1_score(y_test_m, y_pred_m_custom))

f1 score:  0.15637755507202644
f1 score:  0.17581973239229526
f1 score:  0.19602929421676857
f1 score:  0.21593312518884078
f1 score:  0.23284136051265694
f1 score:  0.24706115440496632
f1 score:  0.26102831594634873
f1 score:  0.27262122979503417
f1 score:  0.2842901975263061
f1 score:  0.29191000964907826
f1 score:  0.29763683657092255
f1 score:  0.30287678668355345
f1 score:  0.30467216332940716
f1 score:  0.30598013664858775
f1 score:  0.30708661417322836
f1 score:  0.30337898063366014
f1 score:  0.30019863546074793
f1 score:  0.2942411243953637
f1 score:  0.29056203605514314
f1 score:  0.28591735201543306
f1 score:  0.2822098976109215
f1 score:  0.27391742195367574
f1 score:  0.26741337066853343
f1 score:  0.2610813443740867
f1 score:  0.2515228426395939
f1 score:  0.24344176285414482
f1 score:  0.23316202326210442
f1 score:  0.22200529912146144
f1 score:  0.21181204196005174
f1 score:  0.2045689019896831
f1 score:  0.1958653991247925
f1 score:  0.187335500851525
f1 score:  0.1760