# Importing Libraries


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder #conda install category_encoders
from xgboost import XGBClassifier #conda install xgboost
from skopt.space import Real, Categorical, Integer
from xgboost import plot_importance
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)



# 1) Preparing data


In [31]:
df = pd.read_csv('cleaned.csv')
X = df.drop(columns='Bankrupt?')
y = df['Bankrupt?']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

# 2a) Hyperparameter Tuning - XGBClassfier

In [32]:
xgbc_model = XGBClassifier(random_state=42)

params = {
    'learning_rate': stats.uniform(0.01, 0.3),  # Learning rate
    'max_depth': stats.randint(1, 10),  # Maximum depth of the tree
    'min_child_weight': stats.randint(1, 10),  # Minimum sum of weights of all observations required in a child node
    'subsample': stats.uniform(0.5, 0.5),  # Subsample ratio of the training instances
    'colsample_bytree': stats.uniform(0.5, 0.5),  # Subsample ratio of columns when constructing each tree
    'gamma': stats.uniform(0, 5),  # Minimum loss reduction required to make a further partition on a leaf node of the tree
    'reg_alpha': stats.uniform(0, 1),  # L1 regularization term on weights
    'reg_lambda': stats.uniform(0, 1),  # L2 regularization term on weights
    'n_estimators': stats.randint(100, 1000)  # Number of trees in the forest
}

xgbc_random = RandomizedSearchCV(
    estimator=xgbc_model, 
    param_distributions=params, 
    n_iter=10, 
    cv=5, 
    random_state=42, 
    n_jobs=-1
)

# 2b) Hyperparameter Tuning - Random Forest Classifier

In [33]:
rf_model = RandomForestClassifier(random_state=42);
params = {
    'n_estimators': [50, 100, 150, 200], 
    'max_depth': [3, 5, 7, 10, None], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# define the Randomized Search Cross Validation
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=params, 
                               n_iter=10, cv=5, random_state=42, n_jobs=-1)

# 2c) Hyperparameter Tuning - Logistic Regression

In [34]:
logreg_model = LogisticRegression(random_state=42,max_iter=10000)
params = {
    'C': stats.uniform(loc=0, scale=4),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
}

logreg_random = RandomizedSearchCV(logreg_model, params, n_iter=10, cv=5, random_state=42, n_jobs=-1)


# 3) Evaluation of Models

In [43]:
models = [xgbc_random, rf_random,logreg_random]
models_name = ["XGBClassifier", "RandomForestClassfier","LogisticRegression"]
for i in range(len(models)):
    model = models[i]
    model_name = models_name[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Model: " + model_name)
    print(classification_report(y_test, y_pred))
    print("-" * 30, "\n")

Model: XGBClassifier
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1320
           1       0.50      0.25      0.33        44

    accuracy                           0.97      1364
   macro avg       0.74      0.62      0.66      1364
weighted avg       0.96      0.97      0.96      1364

------------------------------ 

Model: RandomForestClassfier
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1320
           1       0.60      0.20      0.31        44

    accuracy                           0.97      1364
   macro avg       0.79      0.60      0.64      1364
weighted avg       0.96      0.97      0.96      1364

------------------------------ 

Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1320
           1       0.50      0.07      0.12        44

    accuracy                           0