In [1]:
import os
import time
import joblib
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy import stats

RANDOM_STATE = 42 

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, roc_curve, precision_recall_curve,
    average_precision_score, classification_report, confusion_matrix,
    accuracy_score, f1_score, precision_score, recall_score
)

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import shap

# Single seed value for the notebook; seeding NumPy's legacy global RNG here
# makes np.random-based draws repeatable from a fresh kernel.
# NOTE(review): RANDOM_STATE is also assigned earlier in this cell, between the
# import groups; one of the two assignments is redundant.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
def load_data(local_path="clean_outliers_data.csv"):
    """Read the dataset CSV at ``local_path`` into a DataFrame.

    Parameters
    ----------
    local_path : str or path-like, default "clean_outliers_data.csv"
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``local_path``.
    """
    if not os.path.exists(local_path):
        # Fail loudly here: falling through would return None and only defer
        # the crash to the first place the frame is used.
        raise FileNotFoundError(f"Dataset file not found: {local_path!r}")

    df = pd.read_csv(local_path)
    # Report success only once the read has actually completed.
    print(f"Loaded dataset from local file: {local_path}")
    return df

# Load the dataset from the default local CSV path.
df = load_data()

Loaded dataset from local file: clean_outliers_data.csv


In [8]:
def basic_eda(df, target_keywords=("bankrupt", "class", "label", "target")):
    """Print a quick overview of ``df`` and return the detected target column.

    The overview consists of ``df.info()``, the first rows, the columns with
    missing values (at most 20, most-missing first) and the value counts of
    the detected target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.
    target_keywords : iterable of str, optional
        Case-insensitive substrings; the first column whose label contains
        any of them is treated as the target.

    Returns
    -------
    hashable
        Label of the detected target column. When no label matches a
        keyword, the last column of ``df`` is returned.
    """
    print("== Dataset Info ==")
    df.info()

    print("\n== Head ==")
    print(df.head())

    print("\n== Missing values (top 20) ==")
    missing = df.isnull().sum()
    missing = missing[missing > 0].sort_values(ascending=False)

    if missing.empty:
        print("No missing values detected.")
    else:
        print(missing.head(20))

    print("\n== Target distribution ==")
    keywords = [str(key).lower() for key in target_keywords]
    # Column labels are not guaranteed to be strings (a header-less frame has
    # integer labels), so go through str() before the substring match.
    target_cols = [
        c for c in df.columns
        if any(key in str(c).lower() for key in keywords)
    ]

    # Fall back to the last column when no label matches a keyword.
    target_col = target_cols[0] if target_cols else df.columns[-1]
    print(f"Detected target column: {target_col}")
    print(df[target_col].value_counts())

    return target_col


In [9]:
# Print the EDA summary for the loaded frame and keep the detected target column label.
target_col = basic_eda(df)

== Dataset Info ==
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5539 entries, 0 to 5538
Data columns (total 12 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 5539 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  5539 non-null   float64
 2    Net Value Per Share (B)                                  5539 non-null   float64
 3    Persistent EPS in the Last Four Seasons                  5539 non-null   float64
 4    Debt ratio %                                             5539 non-null   float64
 5    Borrowing dependency                                     5539 non-null   float64
 6    Working Capital to Total Assets                          5539 non-null   float64
 7    Current Liability to Assets                              5539 non-null   float64
 8  