# Importing Libraries


In [126]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder #conda install category_encoders
from xgboost import XGBClassifier #conda install xgboost
from skopt import BayesSearchCV #conda install -c conda-forge scikit-optimize
from skopt.space import Real, Categorical, Integer
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Preparing data

In [127]:
# Load the company-bankruptcy dataset and keep every column: the label
# 'Bankrupt?' and the flag features referenced by later cells must stay
# available. A leftover `df[["", "Sex"]]` selection was removed here because
# neither column exists in this file, so it raised a KeyError on a fresh run.
df = pd.read_csv('data.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    After-tax net Int

0    6599
1     220
Name: Bankrupt?, dtype: int64

In [128]:
# Separate the label column from the feature columns.
y = df['Bankrupt?']
X = df.drop(columns='Bankrupt?')

# Hold out 20% of the rows for testing. The classes are heavily imbalanced,
# so stratify on the label to keep the same class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# Training Pipeline for Model

In [129]:
# Two-step pipeline: target-encode the two flag columns, then classify with
# XGBoost. The column names in this dataset carry a leading space.
estimators = [
    (
        'encoder',
        TargetEncoder(cols=[' Liability-Assets Flag', ' Net Income Flag']),
    ),
    # The objective function can be customized via the `objective` parameter.
    ('clf', XGBClassifier(random_state=0)),
]
pipe = Pipeline(steps=estimators)

# Hyperparameter Tuning

In [130]:
# Hyperparameter ranges explored by the Bayesian search. The `clf__` prefix
# routes each setting to the 'clf' step of the pipeline.
search_space = dict(
    # Tree shape and step size (learning rate is sampled on a log scale).
    clf__max_depth=Integer(2, 10),
    clf__learning_rate=Real(0.001, 1.0, prior='log-uniform'),
    # Row and column subsampling fractions.
    clf__subsample=Real(0.5, 1.0),
    clf__colsample_bytree=Real(0.5, 1.0),
    clf__colsample_bylevel=Real(0.5, 1.0),
    clf__colsample_bynode=Real(0.5, 1.0),
    # Regularization strengths.
    clf__reg_alpha=Real(0.0, 10.0),
    clf__reg_lambda=Real(0.0, 10.0),
    clf__gamma=Real(0.0, 10.0),
)

In [131]:
# Bayesian hyperparameter search over the pipeline: 10 sampled configurations,
# each scored by ROC AUC with 3-fold cross-validation.
opt = BayesSearchCV(
    estimator=pipe,
    search_spaces=search_space,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    random_state=0,
)

In [132]:
# Run the hyperparameter search on the training split only; the fitted search
# object is echoed as the cell output.
opt.fit(X_train, y_train)

BayesSearchCV(cv=3,
              estimator=Pipeline(steps=[('encoder',
                                         TargetEncoder(cols=[' '
                                                             'Liability-Assets '
                                                             'Flag',
                                                             ' Net Income '
                                                             'Flag'])),
                                        ('clf',
                                         XGBClassifier(base_score=None,
                                                       booster=None,
                                                       callbacks=None,
                                                       colsample_bylevel=None,
                                                       colsample_bynode=None,
                                                       colsample_bytree=None,
                                                       early_stopping_

# Evaluation of Model

In [133]:
# Display the pipeline carrying the best hyperparameters found by the search.
opt.best_estimator_

Pipeline(steps=[('encoder',
                 TargetEncoder(cols=[' Liability-Assets Flag',
                                     ' Net Income Flag'])),
                ('clf',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=0.5737099059755859,
                               colsample_bynode=0.8916331353364969,
                               colsample_bytree=0.7349720201814586,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None...
                               gpu_id=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None,
                               learning_rate=0.2842833771983085, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                     

In [134]:
# Cross-validated score (ROC AUC, as configured on the search) of the best
# configuration, measured on the training data.
opt.best_score_

0.9342617348477272

In [135]:
# Score the best model on the held-out test split, using the search's
# configured metric (ROC AUC).
opt.score(X_test, y_test)

0.9459366391184573

In [137]:
# Predicted class labels for the held-out test split.
opt.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [138]:
# Predicted class probabilities for the held-out test split (one column per
# class, one row per sample).
opt.predict_proba(X_test)

array([[0.9967339 , 0.0032661 ],
       [0.9876039 , 0.01239611],
       [0.99524075, 0.00475923],
       ...,
       [0.9579474 , 0.04205265],
       [0.9836416 , 0.0163584 ],
       [0.99582815, 0.00417185]], dtype=float32)