In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import RobustScaler, quantile_transform, StandardScaler, MinMaxScaler
from imblearn.over_sampling import ADASYN , SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier # XGBoost
from lightgbm import LGBMClassifier # LightGBM
warnings.filterwarnings('ignore')

In [6]:
santander_train = pd.read_csv('../data/santander-customer-satisfaction/train.csv')
santander_test = pd.read_csv('../data/santander-customer-satisfaction/test.csv')

In [7]:
santander_train=santander_train.drop('ID', axis=1, inplace=False)
test_df=santander_test.drop('ID', axis=1, inplace=False)

In [37]:
santander_train['var3'].value_counts()
santander_train['var3'].replace(-999999, 0, inplace=True)

In [103]:
test_df['var3'].replace(-999999, 0, inplace=True)

---
## 스케일링 함수

---

In [8]:
def scale_data(data:pd.DataFrame, scaler:object)->pd.DataFrame:
    if scaler == 'Robust':
        scaler = RobustScaler()
        data_scaled = scaler.fit_transform(data)
    elif scaler == 'Quantile':
        data_scaled = quantile_transform(data, output_distribution='normal', random_state=150)
    elif scaler == 'Standard':
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)
    elif scaler == 'MinMax':
        scaler = MinMaxScaler()
        data_scaled = scaler.fit_transform(data)
    else:
        raise 'Invalid Scaler'
    return data_scaled

In [9]:
X_features = santander_train.drop('TARGET', axis=1, inplace=False)
y_target = santander_train['TARGET']

In [10]:
y_target.value_counts()

TARGET
0    73012
1     3008
Name: count, dtype: int64

In [11]:
# 오버샘플링 함수
def sampling_data(data, target, sampler:object):
    if sampler == 'ada': 
        data , target = ADASYN(random_state=150).fit_resample(data, target)
    elif sampler == 'smote' : 
        data, target= SMOTE(random_state=150).fit_resample(data, target)
    else: raise 'Invalid Sampler'
    return data, target

In [38]:
X_data_sm, y_target_sm =  sampling_data(X_features, y_target, 'smote') # smote 오버샘플링
X_data_ada, y_target_ada =  sampling_data(X_features, y_target, 'ada') # ADASYN 오버샘플링

In [14]:
y_target_sm.value_counts()

TARGET
0    73012
1    73012
Name: count, dtype: int64

In [104]:
X_features_std = scale_data(X_data_sm, 'Standard')
X_test_std = scale_data(test_df, 'Standard')

In [105]:
X_features_scaled = scale_data(X_data_sm ,'Quantile')
X_test_scaled = scale_data(test_df, 'Quantile')

In [106]:
X_train, X_val, y_train, y_val = train_test_split(X_features_scaled, y_target_sm, test_size=0.3, random_state=150)
X_train_std, X_val_std, y_train_std, y_val_std = train_test_split(X_features_std, y_target_sm, test_size=0.3, random_state=150)

In [107]:
# XGBoost, LightGBM
xgb_clf = XGBClassifier(n_estimators=500, random_state=150, max_depth=8, learning_rate=0.05, n_jobs=-1, eval_metric='auc')
lgbm_clf = LGBMClassifier(n_estimators=500, random_state=150, max_depth=8, learning_rate=0.05, n_jobs=-1, eval_metric='auc')

In [108]:
xgb_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
lgbm_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])

[0]	validation_0-auc:0.93924
[1]	validation_0-auc:0.93987
[2]	validation_0-auc:0.94153
[3]	validation_0-auc:0.94216
[4]	validation_0-auc:0.94343
[5]	validation_0-auc:0.94698
[6]	validation_0-auc:0.94769
[7]	validation_0-auc:0.94794
[8]	validation_0-auc:0.94910
[9]	validation_0-auc:0.94911
[10]	validation_0-auc:0.94958
[11]	validation_0-auc:0.95153
[12]	validation_0-auc:0.95194
[13]	validation_0-auc:0.95248
[14]	validation_0-auc:0.95408
[15]	validation_0-auc:0.95531
[16]	validation_0-auc:0.95606
[17]	validation_0-auc:0.95671
[18]	validation_0-auc:0.95722
[19]	validation_0-auc:0.95771
[20]	validation_0-auc:0.95820
[21]	validation_0-auc:0.95870
[22]	validation_0-auc:0.95904
[23]	validation_0-auc:0.95948
[24]	validation_0-auc:0.95976
[25]	validation_0-auc:0.96022
[26]	validation_0-auc:0.96042
[27]	validation_0-auc:0.96073
[28]	validation_0-auc:0.96130
[29]	validation_0-auc:0.96174
[30]	validation_0-auc:0.96206
[31]	validation_0-auc:0.96280
[32]	validation_0-auc:0.96323
[33]	validation_0-au

In [109]:
xgb_pred = xgb_clf.predict(X_test_scaled)
lgbm_pred = lgbm_clf.predict(X_test_scaled)



In [110]:
values_df = pd.DataFrame(data=[xgb_pred, lgbm_pred], index=['XGBoost', 'LightGBM']).T

In [111]:
values_df['LightGBM'].value_counts()

LightGBM
1    42295
0    33523
Name: count, dtype: int64

In [112]:
values_df['XGBoost'].value_counts()

XGBoost
1    57103
0    18715
Name: count, dtype: int64

In [113]:
from sklearn.metrics import confusion_matrix, roc_auc_score
confusion_matrix(values_df['XGBoost'], values_df['LightGBM'])

array([[16910,  1805],
       [16613, 40490]])

In [114]:
roc_auc_score(values_df['XGBoost'], values_df['LightGBM'])

np.float64(0.8063114377608331)

In [115]:
meta_model = LogisticRegression()

# 스태킹 모델 정의
stacking_clf = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('lgbm', lgbm_clf)],
    final_estimator=meta_model,
    cv=5
)

# 모델 학습
stacking_clf.fit(X_train, y_train)

# 예측 및 평가
y_pred = stacking_clf.predict(X_test_scaled)
y_pred_proba = stacking_clf.predict_proba(X_test_scaled)[:, 1]


[LightGBM] [Info] Number of positive: 50840, number of negative: 51376
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17963
[LightGBM] [Info] Number of data points in the train set: 102216, number of used features: 258
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497378 -> initscore=-0.010488
[LightGBM] [Info] Start training from score -0.010488
[LightGBM] [Info] Number of positive: 40672, number of negative: 41100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17229
[LightGBM] [Info] Number of data points in the train set: 81772, number of used features: 252
[LightGBM] [I

In [116]:
print(y_pred)
print(y_pred_proba)

[1 1 1 ... 0 1 0]
[0.98328453 0.97911097 0.89855174 ... 0.05786252 0.97909473 0.02672316]


In [117]:
roc_auc_score(y_pred, y_pred_proba)

np.float64(1.0)

In [118]:
Stacking_submission = pd.DataFrame(data={'ID':santander_test['ID'], 'TARGET':y_pred})

In [120]:
Stacking_submission.to_csv('./santander_submission/Stacking_submission_smote.csv', index=False)

In [121]:
xgb_submission = pd.DataFrame(data={'ID':santander_test['ID'], 'TARGET':xgb_pred})
lgbm_submission = pd.DataFrame(data={'ID':santander_test['ID'], 'TARGET':lgbm_pred})

In [122]:
xgb_submission.to_csv('./santander_submission/xgb_submission_smote_.csv', index=False)
lgbm_submission.to_csv('./santander_submission/lgbm_submission_smote_.csv', index=False)

OSError: Cannot save file into a non-existent directory: 'santander'

---
## 하이퍼 파라미터 튜닝

---


In [62]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import roc_auc_score

In [123]:
# 하이퍼 파라미터 튜닝
def hyperopt_train_test(params):
    xgb_test = XGBClassifier(**params)
    return cross_val_score(xgb_clf, X_features_std, y_target_sm, cv=5).mean()

In [124]:
param_space = {
    'n_estimators': hp.choice('n_estimators', range(100, 1000)),
    'max_depth': hp.choice('max_depth', range(1, 10)),
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.02),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.01),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
    'gamma': hp.quniform('gamma', 0, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'eval_metric': hp.choice('eval_metric', ['auc'])
}

In [125]:
def f(params):
    acc = hyperopt_train_test(params)
    return {'loss': -acc, 'status': STATUS_OK}

In [126]:
trials = Trials()
best = fmin(f, param_space, algo=tpe.suggest, max_evals=20, trials=trials)

100%|██████████| 20/20 [21:07<00:00, 63.39s/trial, best loss: -0.9348326910210005]


In [127]:
print(best)

{'colsample_bytree': np.float64(0.5), 'eval_metric': np.int64(0), 'gamma': np.float64(6.0), 'learning_rate': np.float64(0.16), 'max_depth': np.int64(8), 'min_child_weight': np.float64(8.0), 'n_estimators': np.int64(581), 'subsample': np.float64(0.8300000000000001)}


In [128]:
# 하이퍼 파라미터 튜닝 결과로 나온 최적의 파라미터
best_params = {
    "colsample_bytree": best['colsample_bytree'],
    'eval_metric': 'auc',
    'gamma': best['gamma'],
    'learning_rate': best['learning_rate'],
    'max_depth': best['max_depth'],
    'min_child_weight': best['min_child_weight'],
    'n_estimators': best['n_estimators'],
    'subsample': best['subsample']
}
xgb_hyperopt = XGBClassifier(**best_params)

In [129]:
xgb_hyperopt.fit(X_train, y_train, eval_set = [(X_val, y_val)])

[0]	validation_0-auc:0.92551
[1]	validation_0-auc:0.94757
[2]	validation_0-auc:0.95372
[3]	validation_0-auc:0.95545
[4]	validation_0-auc:0.95898
[5]	validation_0-auc:0.96034
[6]	validation_0-auc:0.96211
[7]	validation_0-auc:0.96320
[8]	validation_0-auc:0.96442
[9]	validation_0-auc:0.96543
[10]	validation_0-auc:0.96653
[11]	validation_0-auc:0.96761
[12]	validation_0-auc:0.96850
[13]	validation_0-auc:0.96917
[14]	validation_0-auc:0.96961
[15]	validation_0-auc:0.97038
[16]	validation_0-auc:0.97093
[17]	validation_0-auc:0.97141
[18]	validation_0-auc:0.97214
[19]	validation_0-auc:0.97289
[20]	validation_0-auc:0.97349
[21]	validation_0-auc:0.97376
[22]	validation_0-auc:0.97414
[23]	validation_0-auc:0.97444
[24]	validation_0-auc:0.97485
[25]	validation_0-auc:0.97495
[26]	validation_0-auc:0.97539
[27]	validation_0-auc:0.97581
[28]	validation_0-auc:0.97613
[29]	validation_0-auc:0.97631
[30]	validation_0-auc:0.97644
[31]	validation_0-auc:0.97665
[32]	validation_0-auc:0.97704
[33]	validation_0-au

In [130]:
xgb_hyperopt_pred = xgb_hyperopt.predict(X_test_std)

In [131]:
y_pred_proba_opt = xgb_hyperopt.predict_proba(X_test_std)[:, 1]

In [132]:
xgb_hyperopt_submission = pd.DataFrame(data={'ID':santander_test['ID'], 'TARGET':xgb_hyperopt_pred})

In [133]:
confusion_matrix(xgb_hyperopt_pred, y_pred)


array([[    0,     0],
       [18574, 57244]])

In [134]:
xgb_hyperopt_submission.to_csv('./santander_submission/xgb_hyperopt_submission.csv', index=False)