In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# 1. Load the training and test CSVs (both are read as cp949).
csv_options = dict(encoding='cp949')
train_data = pd.read_csv('train2.csv', **csv_options)
test_data = pd.read_csv('test2.csv', **csv_options)

# 2. Downsampling to even out the classes: rows labelled 0 are sampled
#    without replacement down to the number of rows labelled 1.
# NOTE(review): this runs before the train/validation split, so the
# validation set is balanced as well — confirm that is intended.
label_col = train_data['임신 성공 여부']
df_minority = train_data[label_col == 1]
df_majority = train_data[label_col == 0]

df_majority_downsampled = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=42
)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# 3. Separate features and target; the listed columns are excluded from X.
cols_to_drop = [
    'ID', '시술 시기 코드', '시술 유형', '임신 성공 여부',
    '여성 주 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인',
    'IVF 시술 횟수', 'IVF 출산 횟수', '혼합된 난자 수',
    '동결 배아 사용 여부', '신선 배아 사용 여부',
]
y = df_downsampled['임신 성공 여부']
X = df_downsampled.drop(columns=cols_to_drop)

# 4. Stratified 70/30 train/validation split.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# 5. Decision Tree hyper-parameter tuning (GridSearchCV, scored by ROC-AUC).
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    dt, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)
# fit() returns the search object itself, so the refit winner can be read off directly.
best_dt = grid_search.fit(X_train, y_train).best_estimator_
print("Decision Tree 최적의 하이퍼파라미터:", grid_search.best_params_)

# 6. Evaluate the refit best tree on the validation split.
y_prob = best_dt.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred = best_dt.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)
conf_mat = confusion_matrix(y_val, y_pred)

print(
    "Accuracy: {:.4f}".format(accuracy),
    "ROC-AUC: {:.4f}".format(roc_auc),
    "Confusion Matrix:",
    conf_mat,
    sep="\n",
)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Decision Tree 최적의 하이퍼파라미터: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Accuracy: 0.6585
ROC-AUC: 0.7117
Confusion Matrix:
[[10794  8828]
 [ 4574 15047]]


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# 1. Load the training and test CSVs (both are read as cp949).
csv_options = dict(encoding='cp949')
train_data = pd.read_csv('train2.csv', **csv_options)
test_data = pd.read_csv('test2.csv', **csv_options)

# 2. Downsampling to even out the classes: rows labelled 0 are sampled
#    without replacement down to the number of rows labelled 1.
# NOTE(review): this runs before the train/validation split, so the
# validation set is balanced as well — confirm that is intended.
label_col = train_data['임신 성공 여부']
df_minority = train_data[label_col == 1]
df_majority = train_data[label_col == 0]

df_majority_downsampled = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=42
)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# 3. Separate features and target; the listed columns are excluded from X.
cols_to_drop = [
    'ID', '시술 시기 코드', '시술 유형', '임신 성공 여부',
    '여성 주 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인',
    'IVF 시술 횟수', 'IVF 출산 횟수', '혼합된 난자 수',
    '동결 배아 사용 여부', '신선 배아 사용 여부',
]
y = df_downsampled['임신 성공 여부']
X = df_downsampled.drop(columns=cols_to_drop)

# 4. Stratified 70/30 train/validation split.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# 5. Decision Tree hyper-parameter tuning (wider search over more knobs).
# This grid enumerates 1*2*2*2*3*2*1*3*3 = 432 candidates.
# NOTE(review): the output recorded under this cell reports 9720 candidates
# and parameters outside this grid, so it appears to come from an earlier
# version of the grid — re-run the cell to refresh it.
param_grid = dict(
    criterion=['entropy'],
    splitter=['best', 'random'],
    max_depth=[15, 20],
    min_samples_split=[20, 40],
    min_samples_leaf=[4, 8, 16],
    max_features=[None, 'sqrt'],
    max_leaf_nodes=[50],
    min_impurity_decrease=[0.0, 0.001, 0.01],
    ccp_alpha=[0.0, 0.001, 0.01],
)

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    dt, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)

# fit() returns the search object itself, so the refit winner can be read off directly.
best_dt = grid_search.fit(X_train, y_train).best_estimator_
print("Decision Tree 최적의 하이퍼파라미터:", grid_search.best_params_)

# 6. Evaluate the refit best tree on the validation split.
y_prob = best_dt.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred = best_dt.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)
conf_mat = confusion_matrix(y_val, y_pred)

print(
    "Accuracy: {:.4f}".format(accuracy),
    "ROC-AUC: {:.4f}".format(roc_auc),
    "Confusion Matrix:",
    conf_mat,
    sep="\n",
)

Fitting 5 folds for each of 9720 candidates, totalling 48600 fits
Decision Tree 최적의 하이퍼파라미터: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
Accuracy: 0.6606
ROC-AUC: 0.7144
Confusion Matrix:
[[ 9707  9915]
 [ 3404 16217]]


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# 1. Load the training and test CSVs (both are read as cp949).
csv_options = dict(encoding='cp949')
train_data = pd.read_csv('train2.csv', **csv_options)
test_data = pd.read_csv('test2.csv', **csv_options)

# 2. Downsampling to even out the classes: rows labelled 0 are sampled
#    without replacement down to the number of rows labelled 1.
# NOTE(review): this runs before the train/validation split, so the
# validation set is balanced as well — confirm that is intended.
label_col = train_data['임신 성공 여부']
df_minority = train_data[label_col == 1]
df_majority = train_data[label_col == 0]

df_majority_downsampled = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=42
)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# 3. Separate features and target; the listed columns are excluded from X.
cols_to_drop = [
    'ID', '시술 시기 코드', '시술 유형', '임신 성공 여부',
    '여성 주 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인',
    'IVF 시술 횟수', 'IVF 출산 횟수', '혼합된 난자 수',
    '동결 배아 사용 여부', '신선 배아 사용 여부',
]
y = df_downsampled['임신 성공 여부']
X = df_downsampled.drop(columns=cols_to_drop)

# 4. Stratified 70/30 train/validation split.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# 5. Decision Tree hyper-parameter tuning (wider search over more knobs).
# This grid enumerates 1*2*2*2*3*2*1*3*3 = 432 candidates.
param_grid = dict(
    criterion=['entropy'],
    splitter=['best', 'random'],
    max_depth=[15, 20],
    min_samples_split=[20, 40],
    min_samples_leaf=[4, 8, 16],
    max_features=[None, 'sqrt'],
    max_leaf_nodes=[50],
    min_impurity_decrease=[0.0, 0.001, 0.01],
    ccp_alpha=[0.0, 0.001, 0.01],
)

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    dt, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)

# fit() returns the search object itself, so the refit winner can be read off directly.
best_dt = grid_search.fit(X_train, y_train).best_estimator_
print("Decision Tree 최적의 하이퍼파라미터:", grid_search.best_params_)

# 6. Evaluate the refit best tree on the validation split.
y_prob = best_dt.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred = best_dt.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)
conf_mat = confusion_matrix(y_val, y_pred)

print(
    "Accuracy: {:.4f}".format(accuracy),
    "ROC-AUC: {:.4f}".format(roc_auc),
    "Confusion Matrix:",
    conf_mat,
    sep="\n",
)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Decision Tree 최적의 하이퍼파라미터: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 15, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 16, 'min_samples_split': 20, 'splitter': 'best'}
Accuracy: 0.6606
ROC-AUC: 0.7144
Confusion Matrix:
[[ 9707  9915]
 [ 3404 16217]]


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

#####################################
# 1. Load and preprocess the data
#####################################
# Read the train/test CSVs (both as cp949).
csv_options = dict(encoding='cp949')
train_data = pd.read_csv('train2.csv', **csv_options)
test_data = pd.read_csv('test2.csv', **csv_options)

# Downsampling to even out the classes: rows labelled 0 are sampled without
# replacement down to the number of rows labelled 1.
# NOTE(review): this runs before the train/validation split, so the
# validation set is balanced as well — confirm that is intended.
label_col = train_data['임신 성공 여부']
df_minority = train_data[label_col == 1]
df_majority = train_data[label_col == 0]
df_majority_downsampled = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=42
)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Columns excluded from the feature matrix (the target is one of them).
cols_to_drop = [
    'ID', '시술 시기 코드', '시술 유형', '임신 성공 여부',
    '여성 주 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인',
    'IVF 시술 횟수', 'IVF 출산 횟수', '혼합된 난자 수',
    '동결 배아 사용 여부', '신선 배아 사용 여부',
]
y = df_downsampled['임신 성공 여부']
X = df_downsampled.drop(columns=cols_to_drop)

# Stratified 70/30 train/validation split (keeps the class ratio in both parts).
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

#####################################
# 2. Model 1: Decision Tree (grid narrowed to the earlier best setting)
#####################################
# Only `splitter` still varies, so the search compares two candidates.
param_grid_dt = dict(
    criterion=['entropy'],
    splitter=['best', 'random'],
    max_depth=[15],
    min_samples_split=[20],
    min_samples_leaf=[16],
    max_features=[None],
    max_leaf_nodes=[50],
    min_impurity_decrease=[0.0],
    ccp_alpha=[0.0],
)

dt = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(
    dt, param_grid_dt, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_dt = grid_search_dt.fit(X_train, y_train).best_estimator_
print(
    "----- Decision Tree 최적의 하이퍼파라미터 -----",
    grid_search_dt.best_params_,
    sep="\n",
)

# Validation metrics for the refit best tree.
y_prob_dt = best_dt.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_dt = best_dt.predict(X_val)
acc_dt = accuracy_score(y_val, y_pred_dt)
roc_dt = roc_auc_score(y_val, y_prob_dt)
conf_dt = confusion_matrix(y_val, y_pred_dt)
print(
    "Decision Tree Accuracy: {:.4f}".format(acc_dt),
    "Decision Tree ROC-AUC: {:.4f}".format(roc_dt),
    "Decision Tree Confusion Matrix:",
    conf_dt,
    "============================================\n",
    sep="\n",
)

#####################################
# 3. Model 2: RandomForest (tuned with GridSearchCV)
#####################################
# NOTE(review): with min_samples_leaf=8 both max_depth candidates are
# presumably never reached, so the two fits may be identical — confirm
# before paying for both.
param_grid_rf = dict(
    n_estimators=[500],
    max_depth=[200, 300],
    min_samples_split=[20],
    min_samples_leaf=[8],
)
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(
    rf, param_grid_rf, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_rf = grid_search_rf.fit(X_train, y_train).best_estimator_
print(
    "----- RandomForest 최적의 하이퍼파라미터 -----",
    grid_search_rf.best_params_,
    sep="\n",
)

# Validation metrics for the refit best forest.
y_prob_rf = best_rf.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_rf = best_rf.predict(X_val)
acc_rf = accuracy_score(y_val, y_pred_rf)
roc_rf = roc_auc_score(y_val, y_prob_rf)
conf_rf = confusion_matrix(y_val, y_pred_rf)
print(
    "RandomForest Accuracy: {:.4f}".format(acc_rf),
    "RandomForest ROC-AUC: {:.4f}".format(roc_rf),
    "RandomForest Confusion Matrix:",
    conf_rf,
    "============================================\n",
    sep="\n",
)

#####################################
# 4. Model 3: LightGBM (baseline model, run through GridSearchCV)
#####################################
# Single-candidate grid: the search only cross-validates and refits this setting.
param_grid_lgb_2 = dict(
    learning_rate=[0.01],
    max_depth=[-1],
    n_estimators=[300],
    num_leaves=[31],
)
lgb_model_plain = lgb.LGBMClassifier(random_state=42)
grid_search_lgb_plain = GridSearchCV(
    lgb_model_plain, param_grid_lgb_2, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_lgb_plain = grid_search_lgb_plain.fit(X_train, y_train).best_estimator_
print(
    "----- LightGBM (기본 모델) 최적의 하이퍼파라미터 -----",
    grid_search_lgb_plain.best_params_,
    sep="\n",
)

# Validation metrics for the refit baseline model.
y_prob_lgb_plain = best_lgb_plain.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_lgb_plain = best_lgb_plain.predict(X_val)
acc_lgb_plain = accuracy_score(y_val, y_pred_lgb_plain)
roc_lgb_plain = roc_auc_score(y_val, y_prob_lgb_plain)
conf_lgb_plain = confusion_matrix(y_val, y_pred_lgb_plain)
print(
    "LightGBM (기본) Accuracy: {:.4f}".format(acc_lgb_plain),
    "LightGBM (기본) ROC-AUC: {:.4f}".format(roc_lgb_plain),
    "LightGBM (기본) Confusion Matrix:",
    conf_lgb_plain,
    "============================================\n",
    sep="\n",
)

#####################################
# 5. Model 4: LightGBM (run through GridSearchCV)
#####################################
# Single-candidate grid: the search only cross-validates and refits this setting.
param_grid_lgb = dict(
    n_estimators=[500],
    learning_rate=[0.01],
    num_leaves=[70],
    max_depth=[20],
)
lgb_estimator = lgb.LGBMClassifier(random_state=42)
grid_search_lgb = GridSearchCV(
    lgb_estimator, param_grid_lgb, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_lgb = grid_search_lgb.fit(X_train, y_train).best_estimator_
print(
    "----- LightGBM (튜닝 모델) 최적의 하이퍼파라미터 -----",
    grid_search_lgb.best_params_,
    sep="\n",
)

# Validation metrics for the refit model.
y_prob_lgb = best_lgb.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_lgb = best_lgb.predict(X_val)
acc_lgb = accuracy_score(y_val, y_pred_lgb)
roc_lgb = roc_auc_score(y_val, y_prob_lgb)
conf_lgb = confusion_matrix(y_val, y_pred_lgb)
print(
    "LightGBM (튜닝) Accuracy: {:.4f}".format(acc_lgb),
    "LightGBM (튜닝) ROC-AUC: {:.4f}".format(roc_lgb),
    "LightGBM (튜닝) Confusion Matrix:",
    conf_lgb,
    "============================================\n",
    sep="\n",
)

#####################################
# 6. Ensemble: soft voting by averaging the four models' probabilities
#####################################
# Each entry is one model's predicted probability of class 1 on X_val.
member_probs = [y_prob_dt, y_prob_rf, y_prob_lgb_plain, y_prob_lgb]
# sum() adds the arrays left to right, then the total is divided by the model count.
ensemble_prob = sum(member_probs) / len(member_probs)
ensemble_pred = (ensemble_prob >= 0.5).astype(int)

ensemble_acc = accuracy_score(y_val, ensemble_pred)
ensemble_roc = roc_auc_score(y_val, ensemble_prob)
ensemble_conf = confusion_matrix(y_val, ensemble_pred)

print(
    "----- Ensemble 결과 (4개 모델 평균) -----",
    "Ensemble Accuracy: {:.4f}".format(ensemble_acc),
    "Ensemble ROC-AUC: {:.4f}".format(ensemble_roc),
    "Ensemble Confusion Matrix:",
    ensemble_conf,
    sep="\n",
)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
----- Decision Tree 최적의 하이퍼파라미터 -----
{'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 15, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 16, 'min_samples_split': 20, 'splitter': 'best'}
Decision Tree Accuracy: 0.6606
Decision Tree ROC-AUC: 0.7144
Decision Tree Confusion Matrix:
[[ 9707  9915]
 [ 3404 16217]]

Fitting 5 folds for each of 2 candidates, totalling 10 fits
----- RandomForest 최적의 하이퍼파라미터 -----
{'max_depth': 200, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 500}
RandomForest Accuracy: 0.6626
RandomForest ROC-AUC: 0.7166
RandomForest Confusion Matrix:
[[10598  9024]
 [ 4216 15405]]

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[LightGBM] [Info] Number of positive: 45783, number of negative: 45782
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008100 seconds.
You can set `force_row_wise=true`

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

#####################################
# 1. Load and preprocess the data
#####################################
# Read the train/test CSVs (both as cp949).
csv_options = dict(encoding='cp949')
train_data = pd.read_csv('train2.csv', **csv_options)
test_data = pd.read_csv('test2.csv', **csv_options)

# Downsampling to even out the classes: rows labelled 0 are sampled without
# replacement down to the number of rows labelled 1.
# NOTE(review): this runs before the train/validation split, so the
# validation set is balanced as well — confirm that is intended.
label_col = train_data['임신 성공 여부']
df_minority = train_data[label_col == 1]
df_majority = train_data[label_col == 0]
df_majority_downsampled = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=42
)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Columns excluded from the feature matrix (the target is one of them).
cols_to_drop = [
    'ID', '시술 시기 코드', '시술 유형', '임신 성공 여부',
    '여성 주 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인',
    'IVF 시술 횟수', 'IVF 출산 횟수', '혼합된 난자 수',
    '동결 배아 사용 여부', '신선 배아 사용 여부',
]
y = df_downsampled['임신 성공 여부']
X = df_downsampled.drop(columns=cols_to_drop)

# Stratified 70/30 train/validation split (keeps the class ratio in both parts).
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

#####################################
# 2. Model 1: Decision Tree (grid narrowed to the earlier best setting)
#####################################
# Only `splitter` still varies, so the search compares two candidates.
param_grid_dt = dict(
    criterion=['entropy'],
    splitter=['best', 'random'],
    max_depth=[15],
    min_samples_split=[20],
    min_samples_leaf=[16],
    max_features=[None],
    max_leaf_nodes=[50],
    min_impurity_decrease=[0.0],
    ccp_alpha=[0.0],
)

dt = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(
    dt, param_grid_dt, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_dt = grid_search_dt.fit(X_train, y_train).best_estimator_
print(
    "----- Decision Tree 최적의 하이퍼파라미터 -----",
    grid_search_dt.best_params_,
    sep="\n",
)

# Validation metrics for the refit best tree.
y_prob_dt = best_dt.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_dt = best_dt.predict(X_val)
acc_dt = accuracy_score(y_val, y_pred_dt)
roc_dt = roc_auc_score(y_val, y_prob_dt)
conf_dt = confusion_matrix(y_val, y_pred_dt)
print(
    "Decision Tree Accuracy: {:.4f}".format(acc_dt),
    "Decision Tree ROC-AUC: {:.4f}".format(roc_dt),
    "Decision Tree Confusion Matrix:",
    conf_dt,
    "============================================\n",
    sep="\n",
)

#####################################
# 3. Model 2: RandomForest (tuned with GridSearchCV)
#####################################
# NOTE(review): with min_samples_leaf=8 both max_depth candidates are
# presumably never reached, so the two fits may be identical — confirm
# before paying for both.
param_grid_rf = dict(
    n_estimators=[500],
    max_depth=[200, 300],
    min_samples_split=[20],
    min_samples_leaf=[8],
)
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(
    rf, param_grid_rf, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_rf = grid_search_rf.fit(X_train, y_train).best_estimator_
print(
    "----- RandomForest 최적의 하이퍼파라미터 -----",
    grid_search_rf.best_params_,
    sep="\n",
)

# Validation metrics for the refit best forest.
y_prob_rf = best_rf.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_rf = best_rf.predict(X_val)
acc_rf = accuracy_score(y_val, y_pred_rf)
roc_rf = roc_auc_score(y_val, y_prob_rf)
conf_rf = confusion_matrix(y_val, y_pred_rf)
print(
    "RandomForest Accuracy: {:.4f}".format(acc_rf),
    "RandomForest ROC-AUC: {:.4f}".format(roc_rf),
    "RandomForest Confusion Matrix:",
    conf_rf,
    "============================================\n",
    sep="\n",
)

#####################################
# 4. Model 3: LightGBM (baseline model, run through GridSearchCV)
#####################################
# Single-candidate grid: the search only cross-validates and refits this setting.
param_grid_lgb_2 = dict(
    learning_rate=[0.01],
    max_depth=[-1],
    n_estimators=[300],
    num_leaves=[31],
)
lgb_model_plain = lgb.LGBMClassifier(random_state=42)
grid_search_lgb_plain = GridSearchCV(
    lgb_model_plain, param_grid_lgb_2, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_lgb_plain = grid_search_lgb_plain.fit(X_train, y_train).best_estimator_
print(
    "----- LightGBM (기본 모델) 최적의 하이퍼파라미터 -----",
    grid_search_lgb_plain.best_params_,
    sep="\n",
)

# Validation metrics for the refit baseline model.
y_prob_lgb_plain = best_lgb_plain.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_lgb_plain = best_lgb_plain.predict(X_val)
acc_lgb_plain = accuracy_score(y_val, y_pred_lgb_plain)
roc_lgb_plain = roc_auc_score(y_val, y_prob_lgb_plain)
conf_lgb_plain = confusion_matrix(y_val, y_pred_lgb_plain)
print(
    "LightGBM (기본) Accuracy: {:.4f}".format(acc_lgb_plain),
    "LightGBM (기본) ROC-AUC: {:.4f}".format(roc_lgb_plain),
    "LightGBM (기본) Confusion Matrix:",
    conf_lgb_plain,
    "============================================\n",
    sep="\n",
)

#####################################
# 5. Model 4: LightGBM (tuned model, run through GridSearchCV)
#####################################
# Single-candidate grid: the search only cross-validates and refits this setting.
param_grid_lgb = dict(
    n_estimators=[500],
    learning_rate=[0.01],
    num_leaves=[70],
    max_depth=[20],
)
lgb_estimator = lgb.LGBMClassifier(random_state=42)
grid_search_lgb = GridSearchCV(
    lgb_estimator, param_grid_lgb, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_lgb = grid_search_lgb.fit(X_train, y_train).best_estimator_
print(
    "----- LightGBM (튜닝 모델) 최적의 하이퍼파라미터 -----",
    grid_search_lgb.best_params_,
    sep="\n",
)

# Validation metrics for the refit tuned model.
y_prob_lgb = best_lgb.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_lgb = best_lgb.predict(X_val)
acc_lgb = accuracy_score(y_val, y_pred_lgb)
roc_lgb = roc_auc_score(y_val, y_prob_lgb)
conf_lgb = confusion_matrix(y_val, y_pred_lgb)
print(
    "LightGBM (튜닝) Accuracy: {:.4f}".format(acc_lgb),
    "LightGBM (튜닝) ROC-AUC: {:.4f}".format(roc_lgb),
    "LightGBM (튜닝) Confusion Matrix:",
    conf_lgb,
    "============================================\n",
    sep="\n",
)

#####################################
# 6. Model 5: Logistic Regression (tuned with GridSearchCV)
#####################################
# LogisticRegression is already imported in this cell's import block, so the
# duplicate mid-cell import that used to sit here has been removed.

# Only the regularisation strength C varies; penalty, solver and max_iter are fixed.
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [1000]
}
lr = LogisticRegression(random_state=42)
grid_search_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1
)
grid_search_lr.fit(X_train, y_train)
best_lr = grid_search_lr.best_estimator_  # refit on all of X_train with the best C
print("----- Logistic Regression 최적의 하이퍼파라미터 -----")
print(grid_search_lr.best_params_)

# Validation metrics for the refit best model.
y_pred_lr = best_lr.predict(X_val)
y_prob_lr = best_lr.predict_proba(X_val)[:, 1]  # probability of class 1
acc_lr = accuracy_score(y_val, y_pred_lr)
roc_lr = roc_auc_score(y_val, y_prob_lr)
conf_lr = confusion_matrix(y_val, y_pred_lr)
print("Logistic Regression Accuracy: {:.4f}".format(acc_lr))
print("Logistic Regression ROC-AUC: {:.4f}".format(roc_lr))
print("Logistic Regression Confusion Matrix:")
print(conf_lr)
print("============================================\n")

#####################################
# 7. Model 6: XGBoost (tuned with GridSearchCV)
#####################################
# XGBClassifier is already imported in this cell's import block, so the
# duplicate mid-cell import that used to sit here has been removed.

# 2 learning rates x 2 subsample ratios = 4 candidates.
param_grid_xgb = {
    'n_estimators': [200],
    'max_depth': [7],
    'learning_rate': [0.001, 0.01],
    'subsample': [0.8, 1.0]
}
# `use_label_encoder` is intentionally not passed: the recorded run of this
# cell warns `Parameters: { "use_label_encoder" } are not used.`, i.e. the
# installed XGBoost ignores it and only emits noise for every fit.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1
)
grid_search_xgb.fit(X_train, y_train)
best_xgb = grid_search_xgb.best_estimator_  # refit on all of X_train with the best setting
print("----- XGBoost 최적의 하이퍼파라미터 -----")
print(grid_search_xgb.best_params_)

# Validation metrics for the refit best model.
y_pred_xgb = best_xgb.predict(X_val)
y_prob_xgb = best_xgb.predict_proba(X_val)[:, 1]  # probability of class 1
acc_xgb = accuracy_score(y_val, y_pred_xgb)
roc_xgb = roc_auc_score(y_val, y_prob_xgb)
conf_xgb = confusion_matrix(y_val, y_pred_xgb)
print("XGBoost Accuracy: {:.4f}".format(acc_xgb))
print("XGBoost ROC-AUC: {:.4f}".format(roc_xgb))
print("XGBoost Confusion Matrix:")
print(conf_xgb)
print("============================================\n")

#####################################
# 8. Ensemble: soft voting by averaging the six models' probabilities
#####################################
# Each entry is one model's predicted probability of class 1 on X_val.
member_probs = [
    y_prob_dt, y_prob_rf, y_prob_lgb_plain, y_prob_lgb, y_prob_lr, y_prob_xgb
]
# sum() adds the arrays left to right, then the total is divided by the model count.
ensemble_prob = sum(member_probs) / len(member_probs)
ensemble_pred = (ensemble_prob >= 0.5).astype(int)

ensemble_acc = accuracy_score(y_val, ensemble_pred)
ensemble_roc = roc_auc_score(y_val, ensemble_prob)
ensemble_conf = confusion_matrix(y_val, ensemble_pred)

print(
    "----- Ensemble 결과 (6개 모델 평균) -----",
    "Ensemble Accuracy: {:.4f}".format(ensemble_acc),
    "Ensemble ROC-AUC: {:.4f}".format(ensemble_roc),
    "Ensemble Confusion Matrix:",
    ensemble_conf,
    sep="\n",
)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
----- Decision Tree 최적의 하이퍼파라미터 -----
{'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 15, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 16, 'min_samples_split': 20, 'splitter': 'best'}
Decision Tree Accuracy: 0.6606
Decision Tree ROC-AUC: 0.7144
Decision Tree Confusion Matrix:
[[ 9707  9915]
 [ 3404 16217]]

Fitting 5 folds for each of 2 candidates, totalling 10 fits
----- RandomForest 최적의 하이퍼파라미터 -----
{'max_depth': 200, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 500}
RandomForest Accuracy: 0.6626
RandomForest ROC-AUC: 0.7166
RandomForest Confusion Matrix:
[[10598  9024]
 [ 4216 15405]]

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[LightGBM] [Info] Number of positive: 45783, number of negative: 45782
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006391 seconds.
You can set `force_row_wise=true`

Parameters: { "use_label_encoder" } are not used.



----- XGBoost 최적의 하이퍼파라미터 -----
{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
XGBoost Accuracy: 0.6625
XGBoost ROC-AUC: 0.7186
XGBoost Confusion Matrix:
[[10573  9049]
 [ 4195 15426]]

----- Ensemble 결과 (6개 모델 평균) -----
Ensemble Accuracy: 0.6651
Ensemble ROC-AUC: 0.7193
Ensemble Confusion Matrix:
[[10413  9209]
 [ 3935 15686]]


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier

#####################################
# 1. Load and preprocess the data
#####################################
# Read the train/test CSVs (both as cp949).
csv_options = dict(encoding='cp949')
train_data = pd.read_csv('train2.csv', **csv_options)
test_data = pd.read_csv('test2.csv', **csv_options)

# Downsampling to even out the classes: rows labelled 0 are sampled without
# replacement down to the number of rows labelled 1.
# NOTE(review): this runs before the train/validation split, so the
# validation set is balanced as well — confirm that is intended.
label_col = train_data['임신 성공 여부']
df_minority = train_data[label_col == 1]
df_majority = train_data[label_col == 0]
df_majority_downsampled = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=42
)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Columns excluded from the feature matrix (the target is one of them).
cols_to_drop = [
    'ID', '시술 시기 코드', '시술 유형', '임신 성공 여부',
    '여성 주 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인',
    'IVF 시술 횟수', 'IVF 출산 횟수', '혼합된 난자 수',
    '동결 배아 사용 여부', '신선 배아 사용 여부',
]
y = df_downsampled['임신 성공 여부']
X = df_downsampled.drop(columns=cols_to_drop)

# Stratified 70/30 train/validation split (keeps the class ratio in both parts).
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

#####################################
# 2. Model 1: Decision Tree (grid narrowed to the earlier best setting)
#####################################
# Only `splitter` still varies, so the search compares two candidates.
param_grid_dt = dict(
    criterion=['entropy'],
    splitter=['best', 'random'],
    max_depth=[15],
    min_samples_split=[20],
    min_samples_leaf=[16],
    max_features=[None],
    max_leaf_nodes=[50],
    min_impurity_decrease=[0.0],
    ccp_alpha=[0.0],
)

dt = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(
    dt, param_grid_dt, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_dt = grid_search_dt.fit(X_train, y_train).best_estimator_
print(
    "----- Decision Tree 최적의 하이퍼파라미터 -----",
    grid_search_dt.best_params_,
    sep="\n",
)

# Validation metrics for the refit best tree.
y_prob_dt = best_dt.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_dt = best_dt.predict(X_val)
acc_dt = accuracy_score(y_val, y_pred_dt)
roc_dt = roc_auc_score(y_val, y_prob_dt)
conf_dt = confusion_matrix(y_val, y_pred_dt)
print(
    "Decision Tree Accuracy: {:.4f}".format(acc_dt),
    "Decision Tree ROC-AUC: {:.4f}".format(roc_dt),
    "Decision Tree Confusion Matrix:",
    conf_dt,
    "============================================\n",
    sep="\n",
)

#####################################
# 3. Model 2: RandomForest (tuned with GridSearchCV)
#####################################
# NOTE(review): with min_samples_leaf=8 both max_depth candidates are
# presumably never reached, so the two fits may be identical — confirm
# before paying for both.
param_grid_rf = dict(
    n_estimators=[500],
    max_depth=[200, 300],
    min_samples_split=[20],
    min_samples_leaf=[8],
)
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(
    rf, param_grid_rf, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_rf = grid_search_rf.fit(X_train, y_train).best_estimator_
print(
    "----- RandomForest 최적의 하이퍼파라미터 -----",
    grid_search_rf.best_params_,
    sep="\n",
)

# Validation metrics for the refit best forest.
y_prob_rf = best_rf.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_rf = best_rf.predict(X_val)
acc_rf = accuracy_score(y_val, y_pred_rf)
roc_rf = roc_auc_score(y_val, y_prob_rf)
conf_rf = confusion_matrix(y_val, y_pred_rf)
print(
    "RandomForest Accuracy: {:.4f}".format(acc_rf),
    "RandomForest ROC-AUC: {:.4f}".format(roc_rf),
    "RandomForest Confusion Matrix:",
    conf_rf,
    "============================================\n",
    sep="\n",
)

#####################################
# 4. Model 3: LightGBM (baseline model, run through GridSearchCV)
#####################################
# Single-candidate grid: the search only cross-validates and refits this setting.
param_grid_lgb_2 = dict(
    learning_rate=[0.01],
    max_depth=[-1],
    n_estimators=[300],
    num_leaves=[31],
)
lgb_model_plain = lgb.LGBMClassifier(random_state=42)
grid_search_lgb_plain = GridSearchCV(
    lgb_model_plain, param_grid_lgb_2, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_lgb_plain = grid_search_lgb_plain.fit(X_train, y_train).best_estimator_
print(
    "----- LightGBM (기본 모델) 최적의 하이퍼파라미터 -----",
    grid_search_lgb_plain.best_params_,
    sep="\n",
)

# Validation metrics for the refit baseline model.
y_prob_lgb_plain = best_lgb_plain.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_lgb_plain = best_lgb_plain.predict(X_val)
acc_lgb_plain = accuracy_score(y_val, y_pred_lgb_plain)
roc_lgb_plain = roc_auc_score(y_val, y_prob_lgb_plain)
conf_lgb_plain = confusion_matrix(y_val, y_pred_lgb_plain)
print(
    "LightGBM (기본) Accuracy: {:.4f}".format(acc_lgb_plain),
    "LightGBM (기본) ROC-AUC: {:.4f}".format(roc_lgb_plain),
    "LightGBM (기본) Confusion Matrix:",
    conf_lgb_plain,
    "============================================\n",
    sep="\n",
)

#####################################
# 5. Model 4: LightGBM (tuned model, run through GridSearchCV)
#####################################
# Single-candidate grid: the search only cross-validates and refits this setting.
param_grid_lgb = dict(
    n_estimators=[500],
    learning_rate=[0.01],
    num_leaves=[70],
    max_depth=[20],
)
lgb_estimator = lgb.LGBMClassifier(random_state=42)
grid_search_lgb = GridSearchCV(
    lgb_estimator, param_grid_lgb, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_lgb = grid_search_lgb.fit(X_train, y_train).best_estimator_
print(
    "----- LightGBM (튜닝 모델) 최적의 하이퍼파라미터 -----",
    grid_search_lgb.best_params_,
    sep="\n",
)

# Validation metrics for the refit tuned model.
y_prob_lgb = best_lgb.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_lgb = best_lgb.predict(X_val)
acc_lgb = accuracy_score(y_val, y_pred_lgb)
roc_lgb = roc_auc_score(y_val, y_prob_lgb)
conf_lgb = confusion_matrix(y_val, y_pred_lgb)
print(
    "LightGBM (튜닝) Accuracy: {:.4f}".format(acc_lgb),
    "LightGBM (튜닝) ROC-AUC: {:.4f}".format(roc_lgb),
    "LightGBM (튜닝) Confusion Matrix:",
    conf_lgb,
    "============================================\n",
    sep="\n",
)

#####################################
# 6. Model 5: Logistic Regression (tuned with GridSearchCV)
#####################################
# Only the regularisation strength C varies; penalty, solver and max_iter are fixed.
param_grid_lr = dict(
    C=[0.1, 1, 10],
    penalty=['l2'],
    solver=['lbfgs'],
    max_iter=[1000],
)
lr = LogisticRegression(random_state=42)
grid_search_lr = GridSearchCV(
    lr, param_grid_lr, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1
)
# fit() returns the search object, so the refit winner is read off directly.
best_lr = grid_search_lr.fit(X_train, y_train).best_estimator_
print(
    "----- Logistic Regression 최적의 하이퍼파라미터 -----",
    grid_search_lr.best_params_,
    sep="\n",
)

# Validation metrics for the refit best model.
y_prob_lr = best_lr.predict_proba(X_val)[:, 1]  # probability of class 1
y_pred_lr = best_lr.predict(X_val)
acc_lr = accuracy_score(y_val, y_pred_lr)
roc_lr = roc_auc_score(y_val, y_prob_lr)
conf_lr = confusion_matrix(y_val, y_pred_lr)
print(
    "Logistic Regression Accuracy: {:.4f}".format(acc_lr),
    "Logistic Regression ROC-AUC: {:.4f}".format(roc_lr),
    "Logistic Regression Confusion Matrix:",
    conf_lr,
    "============================================\n",
    sep="\n",
)

#####################################
# 7. Model 6: XGBoost (GridSearchCV)
#####################################
# Single-candidate grid: GridSearchCV is used here to get a 5-fold ROC-AUC
# estimate and a refitted best_estimator_, not to compare alternatives.
param_grid_xgb = {
    'n_estimators': [200],
    'max_depth': [7],
    'learning_rate': [0.01],
    'subsample': [0.8]
}
# The former `use_label_encoder=False` argument is intentionally dropped: the
# installed XGBoost rejects it as unused ('Parameters: { "use_label_encoder" }
# are not used.'), so it only added a warning to every fit.
# The variable keeps its original name `xgb` so other cells that refer to it
# continue to work.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1
)
grid_search_xgb.fit(X_train, y_train)
best_xgb = grid_search_xgb.best_estimator_
print("----- XGBoost 최적의 하이퍼파라미터 -----")
print(grid_search_xgb.best_params_)

# Validation-set evaluation: hard labels feed accuracy and the confusion
# matrix, column 1 of predict_proba feeds ROC-AUC.
y_pred_xgb = best_xgb.predict(X_val)
y_prob_xgb = best_xgb.predict_proba(X_val)[:, 1]
acc_xgb = accuracy_score(y_val, y_pred_xgb)
roc_xgb = roc_auc_score(y_val, y_prob_xgb)
conf_xgb = confusion_matrix(y_val, y_pred_xgb)
print("XGBoost Accuracy: {:.4f}".format(acc_xgb))
print("XGBoost ROC-AUC: {:.4f}".format(roc_xgb))
print("XGBoost Confusion Matrix:")
print(conf_xgb)
print("============================================\n")

#####################################
# 8. Model 7: ExtraTreesClassifier (GridSearchCV)
#####################################
# Single-candidate grid: one fixed configuration, scored with 5-fold ROC-AUC
# and exposed afterwards as best_estimator_.
param_grid_et = dict(
    n_estimators=[300],
    max_depth=[20],
    min_samples_split=[5],
    min_samples_leaf=[2],
)
et = ExtraTreesClassifier(random_state=42)
grid_search_et = GridSearchCV(
    et,
    param_grid_et,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search_et.fit(X_train, y_train)
best_et = grid_search_et.best_estimator_
print("----- ExtraTrees 최적의 하이퍼파라미터 -----")
print(grid_search_et.best_params_)

# Validation-set evaluation: column 1 of predict_proba feeds ROC-AUC, the hard
# labels feed accuracy and the confusion matrix.
y_prob_et = best_et.predict_proba(X_val)[:, 1]
y_pred_et = best_et.predict(X_val)
conf_et = confusion_matrix(y_val, y_pred_et)
roc_et = roc_auc_score(y_val, y_prob_et)
acc_et = accuracy_score(y_val, y_pred_et)
print(f"ExtraTrees Accuracy: {acc_et:.4f}")
print(f"ExtraTrees ROC-AUC: {roc_et:.4f}")
print("ExtraTrees Confusion Matrix:")
print(conf_et)
print("============================================\n")


#####################################
# 9. Ensemble: soft voting, i.e. the unweighted mean of the models'
#    predicted validation probabilities (currently 7 models)
#####################################
# Per-model validation probabilities taken from column 1 of predict_proba.
# The divisor and the printed model count are both derived from this list, so
# adding or removing a model cannot leave a stale hard-coded count behind.
ensemble_val_probs = [
    y_prob_dt,
    y_prob_rf,
    y_prob_lgb_plain,
    y_prob_lgb,
    y_prob_lr,
    y_prob_xgb,
    y_prob_et,
]
# Built-in sum() adds the arrays element-wise in list order (0 + a + b + ...),
# which matches the original left-to-right addition exactly.
ensemble_prob = sum(ensemble_val_probs) / len(ensemble_val_probs)
# An averaged probability of 0.5 or more is labelled 1, otherwise 0.
ensemble_pred = (ensemble_prob >= 0.5).astype(int)

ensemble_acc = accuracy_score(y_val, ensemble_pred)
ensemble_roc = roc_auc_score(y_val, ensemble_prob)
ensemble_conf = confusion_matrix(y_val, ensemble_pred)

print("----- Ensemble 결과 ({}개 모델 평균) -----".format(len(ensemble_val_probs)))
print("Ensemble Accuracy: {:.4f}".format(ensemble_acc))
print("Ensemble ROC-AUC: {:.4f}".format(ensemble_roc))
print("Ensemble Confusion Matrix:")
print(ensemble_conf)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
----- Decision Tree 최적의 하이퍼파라미터 -----
{'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 15, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 16, 'min_samples_split': 20, 'splitter': 'best'}
Decision Tree Accuracy: 0.6606
Decision Tree ROC-AUC: 0.7144
Decision Tree Confusion Matrix:
[[ 9707  9915]
 [ 3404 16217]]

Fitting 5 folds for each of 2 candidates, totalling 10 fits
----- RandomForest 최적의 하이퍼파라미터 -----
{'max_depth': 200, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 500}
RandomForest Accuracy: 0.6626
RandomForest ROC-AUC: 0.7166
RandomForest Confusion Matrix:
[[10598  9024]
 [ 4216 15405]]

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[LightGBM] [Info] Number of positive: 45783, number of negative: 45782
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008696 seconds.
You can set `force_row_wise=true`

Parameters: { "use_label_encoder" } are not used.



----- XGBoost 최적의 하이퍼파라미터 -----
{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
XGBoost Accuracy: 0.6625
XGBoost ROC-AUC: 0.7186
XGBoost Confusion Matrix:
[[10573  9049]
 [ 4195 15426]]

Fitting 5 folds for each of 1 candidates, totalling 5 fits
----- ExtraTrees 최적의 하이퍼파라미터 -----
{'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
ExtraTrees Accuracy: 0.6605
ExtraTrees ROC-AUC: 0.7145
ExtraTrees Confusion Matrix:
[[10308  9314]
 [ 4010 15611]]

----- Ensemble 결과 (7개 모델 평균) -----
Ensemble Accuracy: 0.6654
Ensemble ROC-AUC: 0.7193
Ensemble Confusion Matrix:
[[10428  9194]
 [ 3935 15686]]


In [26]:
####################################
# 2. Test-set prediction (averaged-probability ensemble)
####################################
# Re-read the test file (cp949-encoded) so the cell starts from a frame
# without previously added prediction columns.
test_data = pd.read_csv('test2.csv', encoding='cp949')

# Columns removed before prediction; this list is meant to mirror the columns
# excluded when the training features were built.
cols_to_drop_test = ['ID', '시술 시기 코드', '시술 유형', '임신 성공 여부',
                     '여성 주 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인',
                     'IVF 시술 횟수', 'IVF 출산 횟수', '혼합된 난자 수',
                     '동결 배아 사용 여부', '신선 배아 사용 여부']
X_test = test_data.drop(columns=cols_to_drop_test)

# Test-set probabilities (column 1 of predict_proba) from each of the
# best_* estimators fitted in the earlier model cells.
test_prob_dt      = best_dt.predict_proba(X_test)[:, 1]
test_prob_rf      = best_rf.predict_proba(X_test)[:, 1]
test_prob_lgb_plain = best_lgb_plain.predict_proba(X_test)[:, 1]
test_prob_lgb     = best_lgb.predict_proba(X_test)[:, 1]
test_prob_lr      = best_lr.predict_proba(X_test)[:, 1]
test_prob_xgb     = best_xgb.predict_proba(X_test)[:, 1]
test_prob_et      = best_et.predict_proba(X_test)[:, 1]

# Unweighted mean over all collected probability vectors. The divisor comes
# from the list length, so it always matches the number of models averaged.
# Built-in sum() adds in list order, matching the original left-to-right sum.
ensemble_test_probs = [
    test_prob_dt, test_prob_rf, test_prob_lgb_plain,
    test_prob_lgb, test_prob_lr, test_prob_xgb, test_prob_et,
]
ensemble_test_prob = sum(ensemble_test_probs) / len(ensemble_test_probs)

# An averaged probability of 0.5 or more is labelled 1, otherwise 0.
ensemble_test_pred = (ensemble_test_prob >= 0.5).astype(int)

# Attach the predictions to the test frame.
test_data['임신 성공 여부 예측'] = ensemble_test_pred
test_data['임신 성공 여부 예측_확률'] = ensemble_test_prob

# Quick look at the first few predictions.
print(test_data[['ID', '임신 성공 여부 예측', '임신 성공 여부 예측_확률']].head())

# Persist the full test frame, including the two new columns, as cp949 CSV.
test_data.to_csv('test_ivf_ensemble_predictions_0225_2734.csv', index=False, encoding='cp949')

           ID  임신 성공 여부 예측  임신 성공 여부 예측_확률
0  TEST_00000            0        0.048366
1  TEST_00001            0        0.048560
2  TEST_00002            0        0.421340
3  TEST_00003            0        0.270972
4  TEST_00004            1        0.630890


In [28]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# -------------------------------
# Stacking ensemble built from the seven best_* estimators of the earlier
# cells (best_dt, best_rf, best_lgb_plain, best_lgb, best_lr, best_xgb, best_et).
# NOTE: with an integer `cv`, StackingClassifier.fit() fits the base
# estimators itself, so the best_* objects effectively contribute their
# hyperparameters here rather than their already-fitted state.
# -------------------------------

# (name, estimator) pairs used as the base layer of the stack.
estimators = list(zip(
    ['dt', 'rf', 'lgb_plain', 'lgb_tuned', 'lr', 'xgb', 'et'],
    [best_dt, best_rf, best_lgb_plain, best_lgb, best_lr, best_xgb, best_et],
))

# Meta-learner stacked on top of the base models' outputs.
meta_estimator = LogisticRegression(max_iter=1000, random_state=42)

# passthrough=True hands the original features to the meta-learner in
# addition to the base-model predictions.
stacking_clf = StackingClassifier(
    estimators,
    final_estimator=meta_estimator,
    passthrough=True,
    cv=5,
    n_jobs=-1,
)

# Fit the whole stack on the training split.
stacking_clf.fit(X_train, y_train)

# Validation-set predictions: hard labels plus column 1 of predict_proba.
y_pred_stack = stacking_clf.predict(X_val)
y_prob_stack = stacking_clf.predict_proba(X_val)[:, 1]

# Same three metrics that are reported for the individual models.
conf_stack = confusion_matrix(y_val, y_pred_stack)
roc_stack = roc_auc_score(y_val, y_prob_stack)
acc_stack = accuracy_score(y_val, y_pred_stack)

print("----- Stacking Ensemble 결과 -----")
print(f"Stacking Ensemble Accuracy: {acc_stack:.4f}")
print(f"Stacking Ensemble ROC-AUC: {roc_stack:.4f}")
print("Stacking Ensemble Confusion Matrix:")
print(conf_stack)

----- Stacking Ensemble 결과 -----
Stacking Ensemble Accuracy: 0.6638
Stacking Ensemble ROC-AUC: 0.7174
Stacking Ensemble Confusion Matrix:
[[10527  9095]
 [ 4097 15524]]


In [30]:
####################################
# 2. Test-set prediction (stacking ensemble)
####################################
# Re-read the test file (cp949-encoded) so the cell starts from a frame
# without previously added prediction columns.
test_data = pd.read_csv('test2.csv', encoding='cp949')

# Columns removed before prediction; this list is meant to mirror the columns
# excluded when the training features were built.
cols_to_drop_test = [
    'ID',
    '시술 시기 코드',
    '시술 유형',
    '임신 성공 여부',
    '여성 주 불임 원인',
    '부부 주 불임 원인',
    '부부 부 불임 원인',
    'IVF 시술 횟수',
    'IVF 출산 횟수',
    '혼합된 난자 수',
    '동결 배아 사용 여부',
    '신선 배아 사용 여부',
]
X_test = test_data.drop(columns=cols_to_drop_test)

# Predict with the StackingClassifier fitted in the previous cell:
# hard labels plus column 1 of predict_proba.
y_test_pred = stacking_clf.predict(X_test)
y_test_prob = stacking_clf.predict_proba(X_test)[:, 1]

# Attach the predictions to the test frame.
test_data['임신 성공 여부 예측'] = y_test_pred
test_data['임신 성공 여부 예측_확률'] = y_test_prob

# Quick look at the first few predictions.
print(
    test_data[['ID', '임신 성공 여부 예측', '임신 성공 여부 예측_확률']].head()
)

# Persist the full test frame, including the two new columns, as cp949 CSV.
test_data.to_csv(
    'test_ivf_ensemble_predictions_0225_stacking.csv',
    index=False,
    encoding='cp949',
)

           ID  임신 성공 여부 예측  임신 성공 여부 예측_확률
0  TEST_00000            0        0.029528
1  TEST_00001            0        0.036881
2  TEST_00002            0        0.374338
3  TEST_00003            0        0.229708
4  TEST_00004            1        0.648581
